scirs2_text/
string_metrics.rs

1//! String metrics module for distance calculations and phonetic algorithms.
2
3use crate::Result;
4use std::cmp::{max, min};
5use std::collections::HashMap;
6
7/// Trait for string distance metrics
8pub trait StringMetric {
9    /// Calculate the distance between two strings
10    fn distance(&self, s1: &str, s2: &str) -> Result<f64>;
11
12    /// Calculate normalized distance (0.0 to 1.0)
13    fn normalized_distance(&self, s1: &str, s2: &str) -> Result<f64> {
14        let dist = self.distance(s1, s2)?;
15        let max_len = max(s1.len(), s2.len()) as f64;
16        Ok(if max_len > 0.0 { dist / max_len } else { 0.0 })
17    }
18
19    /// Calculate similarity (1.0 - normalized distance)
20    fn similarity(&self, s1: &str, s2: &str) -> Result<f64> {
21        Ok(1.0 - self.normalized_distance(s1, s2)?)
22    }
23}
24
25/// Damerau-Levenshtein distance metric
26#[derive(Debug, Clone)]
27pub struct DamerauLevenshteinMetric {
28    /// Whether to use restricted Damerau-Levenshtein (no substring transpositions)
29    restricted: bool,
30}
31
32impl DamerauLevenshteinMetric {
33    /// Create a new Damerau-Levenshtein metric
34    pub fn new() -> Self {
35        Self { restricted: false }
36    }
37
38    /// Create a restricted variant (OSA - Optimal String Alignment)
39    pub fn restricted() -> Self {
40        Self { restricted: true }
41    }
42}
43
44impl Default for DamerauLevenshteinMetric {
45    fn default() -> Self {
46        Self::new()
47    }
48}
49
50impl StringMetric for DamerauLevenshteinMetric {
51    fn distance(&self, s1: &str, s2: &str) -> Result<f64> {
52        if self.restricted {
53            Ok(osa_distance(s1, s2) as f64)
54        } else {
55            Ok(damerau_levenshtein_distance(s1, s2) as f64)
56        }
57    }
58}
59
60/// Calculate restricted Damerau-Levenshtein (OSA) distance
61#[allow(dead_code)]
62fn osa_distance(s1: &str, s2: &str) -> usize {
63    let a: Vec<char> = s1.chars().collect();
64    let b: Vec<char> = s2.chars().collect();
65    let len_a = a.len();
66    let len_b = b.len();
67
68    if len_a == 0 {
69        return len_b;
70    }
71    if len_b == 0 {
72        return len_a;
73    }
74
75    let mut matrix = vec![vec![0; len_b + 1]; len_a + 1];
76
77    for (i, row) in matrix.iter_mut().enumerate().take(len_a + 1) {
78        row[0] = i;
79    }
80    // For the first row, set each column index as the value
81    for j in 0..=len_b {
82        matrix[0][j] = j;
83    }
84
85    for i in 1..=len_a {
86        for j in 1..=len_b {
87            let cost = if a[i - 1] == b[j - 1] { 0 } else { 1 };
88
89            matrix[i][j] = min(
90                min(
91                    matrix[i - 1][j] + 1, // deletion
92                    matrix[i][j - 1] + 1, // insertion
93                ),
94                matrix[i - 1][j - 1] + cost, // substitution
95            );
96
97            // Transposition
98            if i > 1 && j > 1 && a[i - 1] == b[j - 2] && a[i - 2] == b[j - 1] {
99                matrix[i][j] = min(matrix[i][j], matrix[i - 2][j - 2] + cost);
100            }
101        }
102    }
103
104    matrix[len_a][len_b]
105}
106
107/// Calculate full Damerau-Levenshtein distance
108#[allow(dead_code)]
109fn damerau_levenshtein_distance(s1: &str, s2: &str) -> usize {
110    let a: Vec<char> = s1.chars().collect();
111    let b: Vec<char> = s2.chars().collect();
112    let len_a = a.len();
113    let len_b = b.len();
114
115    if len_a == 0 {
116        return len_b;
117    }
118    if len_b == 0 {
119        return len_a;
120    }
121
122    let max_dist = len_a + len_b;
123    let mut h = vec![vec![max_dist; len_b + 2]; len_a + 2];
124    let mut da: HashMap<char, usize> = HashMap::new();
125
126    for i in 0..=len_a {
127        h[i + 1][0] = max_dist;
128        h[i + 1][1] = i;
129    }
130    for j in 0..=len_b {
131        h[0][j + 1] = max_dist;
132        h[1][j + 1] = j;
133    }
134
135    for i in 1..=len_a {
136        let mut db = 0;
137        for j in 1..=len_b {
138            let k = da.get(&b[j - 1]).copied().unwrap_or(0);
139            let l = db;
140            let cost = if a[i - 1] == b[j - 1] {
141                db = j;
142                0
143            } else {
144                1
145            };
146
147            h[i + 1][j + 1] = min(
148                min(
149                    h[i][j] + cost,  // substitution
150                    h[i + 1][j] + 1, // insertion
151                ),
152                min(
153                    h[i][j + 1] + 1,                         // deletion
154                    h[k][l] + (i - k - 1) + 1 + (j - l - 1), // transposition
155                ),
156            );
157        }
158
159        da.insert(a[i - 1], i);
160    }
161
162    h[len_a + 1][len_b + 1]
163}
164
165/// Phonetic algorithms trait
166pub trait PhoneticAlgorithm {
167    /// Encode a string using the phonetic algorithm
168    fn encode(&self, text: &str) -> Result<String>;
169
170    /// Check if two strings are phonetically similar
171    fn sounds_like(&self, s1: &str, s2: &str) -> Result<bool> {
172        Ok(self.encode(s1)? == self.encode(s2)?)
173    }
174}
175
176/// Soundex phonetic algorithm
177#[derive(Debug, Clone)]
178pub struct Soundex {
179    /// Length of the code (standard is 4)
180    length: usize,
181}
182
183impl Soundex {
184    /// Create new Soundex encoder with standard length (4)
185    pub fn new() -> Self {
186        Self { length: 4 }
187    }
188
189    /// Create with custom code length
190    pub fn with_length(length: usize) -> Self {
191        Self { length }
192    }
193}
194
195impl Default for Soundex {
196    fn default() -> Self {
197        Self::new()
198    }
199}
200
201impl PhoneticAlgorithm for Soundex {
202    fn encode(&self, text: &str) -> Result<String> {
203        if text.is_empty() {
204            return Ok(String::new());
205        }
206
207        let text = text.to_uppercase();
208        let chars: Vec<char> = text.chars().filter(|c| c.is_alphabetic()).collect();
209
210        if chars.is_empty() {
211            return Ok(String::new());
212        }
213
214        let mut code = String::new();
215        code.push(chars[0]);
216
217        let mut last_code = encode_char(chars[0]);
218
219        for &ch in &chars[1..] {
220            let ch_code = encode_char(ch);
221            if ch_code != '0' && ch_code != last_code {
222                code.push(ch_code);
223                last_code = ch_code;
224            }
225
226            if code.len() >= self.length {
227                break;
228            }
229        }
230
231        // Pad with zeros if necessary
232        while code.len() < self.length {
233            code.push('0');
234        }
235
236        Ok(code)
237    }
238}
239
240/// Encode a character for Soundex
241#[allow(dead_code)]
242fn encode_char(ch: char) -> char {
243    match ch.to_ascii_uppercase() {
244        'B' | 'F' | 'P' | 'V' => '1',
245        'C' | 'G' | 'J' | 'K' | 'Q' | 'S' | 'X' | 'Z' => '2',
246        'D' | 'T' => '3',
247        'L' => '4',
248        'M' | 'N' => '5',
249        'R' => '6',
250        _ => '0',
251    }
252}
253
254/// Metaphone phonetic algorithm
255#[derive(Debug, Clone)]
256pub struct Metaphone {
257    /// Maximum length of the code
258    max_length: usize,
259}
260
261/// NYSIIS (New York State Identification and Intelligence System) phonetic algorithm
262#[derive(Debug, Clone)]
263pub struct Nysiis {
264    /// Maximum length of the code (0 for no limit)
265    max_length: usize,
266}
267
268/// Needleman-Wunsch global sequence alignment algorithm
269#[derive(Debug, Clone)]
270pub struct NeedlemanWunsch {
271    /// Score for matching characters
272    match_score: i32,
273    /// Penalty for mismatching characters
274    mismatch_penalty: i32,
275    /// Penalty for gaps
276    gap_penalty: i32,
277}
278
279/// Result of sequence alignment
280#[derive(Debug, Clone, PartialEq)]
281pub struct AlignmentResult {
282    /// Aligned first sequence
283    pub aligned_seq1: String,
284    /// Aligned second sequence
285    pub aligned_seq2: String,
286    /// Alignment score
287    pub score: i32,
288}
289
290/// Smith-Waterman local sequence alignment algorithm
291#[derive(Debug, Clone)]
292pub struct SmithWaterman {
293    /// Score for matching characters
294    match_score: i32,
295    /// Penalty for mismatching characters
296    mismatch_penalty: i32,
297    /// Penalty for gaps
298    gap_penalty: i32,
299}
300
301impl Metaphone {
302    /// Create new Metaphone encoder
303    pub fn new() -> Self {
304        Self { max_length: 6 }
305    }
306
307    /// Create with custom maximum length
308    pub fn with_max_length(_maxlength: usize) -> Self {
309        Self {
310            max_length: _maxlength,
311        }
312    }
313}
314
315impl Default for Metaphone {
316    fn default() -> Self {
317        Self::new()
318    }
319}
320
321impl PhoneticAlgorithm for Metaphone {
322    fn encode(&self, text: &str) -> Result<String> {
323        if text.is_empty() {
324            return Ok(String::new());
325        }
326
327        let text = text.to_uppercase();
328        let chars: Vec<char> = text.chars().filter(|c| c.is_alphabetic()).collect();
329
330        if chars.is_empty() {
331            return Ok(String::new());
332        }
333
334        let mut result = String::new();
335        let mut i = 0;
336
337        // Initial transformations
338        if chars.len() >= 2 {
339            match (chars[0], chars[1]) {
340                ('A', 'E') | ('G', 'N') | ('K', 'N') | ('P', 'N') | ('W', 'R') => i = 1,
341                ('W', 'H') => {
342                    result.push('W');
343                    i = 2;
344                }
345                _ => {}
346            }
347        }
348
349        while i < chars.len() && result.len() < self.max_length {
350            let ch = chars[i];
351            let next = chars.get(i + 1).copied();
352            let prev = if i > 0 {
353                chars.get(i - 1).copied()
354            } else {
355                None
356            };
357
358            match ch {
359                'A' | 'E' | 'I' | 'O' | 'U' => {
360                    if i == 0 {
361                        result.push(ch);
362                    }
363                }
364                'B' => {
365                    if !result.ends_with('B') {
366                        result.push('B');
367                    }
368                }
369                'C' => {
370                    if next == Some('H') {
371                        result.push('X');
372                        i += 1;
373                    } else if next == Some('I') || next == Some('E') || next == Some('Y') {
374                        result.push('S');
375                    } else {
376                        result.push('K');
377                    }
378                }
379                'D' => {
380                    if next == Some('G') && chars.get(i + 2) == Some(&'E') {
381                        result.push('J');
382                        i += 2;
383                    } else {
384                        result.push('T');
385                    }
386                }
387                'F' | 'J' | 'L' | 'M' | 'N' | 'R' => {
388                    if !result.ends_with(ch) {
389                        result.push(ch);
390                    }
391                }
392                'G' => {
393                    if next == Some('H') {
394                        i += 1;
395                    } else if next == Some('N') {
396                        result.push('N');
397                        i += 1;
398                    } else {
399                        result.push('K');
400                    }
401                }
402                'H' => {
403                    if prev != Some('C')
404                        && prev != Some('S')
405                        && prev != Some('P')
406                        && prev != Some('T')
407                        && prev != Some('G')
408                    {
409                        result.push('H');
410                    }
411                }
412                'K' => {
413                    if prev != Some('C') {
414                        result.push('K');
415                    }
416                }
417                'P' => {
418                    if next == Some('H') {
419                        result.push('F');
420                        i += 1;
421                    } else {
422                        result.push('P');
423                    }
424                }
425                'Q' => result.push('K'),
426                'S' => {
427                    if next == Some('H') {
428                        result.push('X');
429                        i += 1;
430                    } else if next == Some('I')
431                        && (chars.get(i + 2) == Some(&'A') || chars.get(i + 2) == Some(&'O'))
432                    {
433                        result.push('X');
434                    } else {
435                        result.push('S');
436                    }
437                }
438                'T' => {
439                    if next == Some('H') {
440                        result.push('0');
441                        i += 1;
442                    } else if next != Some('C') || chars.get(i + 2) != Some(&'H') {
443                        result.push('T');
444                    }
445                }
446                'V' => result.push('F'),
447                'W' | 'Y' => {
448                    if next.map(|c| "AEIOU".contains(c)).unwrap_or(false) {
449                        result.push(ch);
450                    }
451                }
452                'X' => {
453                    result.push('K');
454                    result.push('S');
455                }
456                'Z' => result.push('S'),
457                _ => {}
458            }
459
460            i += 1;
461        }
462
463        Ok(result)
464    }
465}
466
467impl Nysiis {
468    /// Create new NYSIIS encoder with no length limit
469    pub fn new() -> Self {
470        Self { max_length: 0 }
471    }
472
473    /// Create with custom maximum length
474    pub fn with_max_length(_maxlength: usize) -> Self {
475        Self {
476            max_length: _maxlength,
477        }
478    }
479}
480
481impl Default for Nysiis {
482    fn default() -> Self {
483        Self::new()
484    }
485}
486
487impl PhoneticAlgorithm for Nysiis {
488    fn encode(&self, text: &str) -> Result<String> {
489        if text.is_empty() {
490            return Ok(String::new());
491        }
492
493        // Convert to uppercase and keep only alphabetic characters
494        let mut word: Vec<char> = text
495            .to_uppercase()
496            .chars()
497            .filter(|c| c.is_alphabetic())
498            .collect();
499
500        if word.is_empty() {
501            return Ok(String::new());
502        }
503
504        // Step 1: Handle initial letter combinations
505        if word.len() >= 3 {
506            let start3 = &word[0..3].iter().collect::<String>();
507            let start2 = &word[0..2].iter().collect::<String>();
508
509            if start3 == "MAC" {
510                word[1] = 'C';
511            } else if start2 == "KN" {
512                word.remove(0);
513            }
514        } else if word.len() >= 2 {
515            let start = &word[0..2].iter().collect::<String>();
516            if start == "KN" {
517                word.remove(0);
518            }
519        }
520
521        // Handle PH, PF at start
522        if word.len() >= 2 && (word[0] == 'P' && (word[1] == 'H' || word[1] == 'F')) {
523            word[0] = 'F';
524            word[1] = 'F';
525        }
526
527        // Handle SCH at start
528        if word.len() >= 3 && word[0] == 'S' && word[1] == 'C' && word[2] == 'H' {
529            word[0] = 'S';
530            word[1] = 'S';
531            word.remove(2); // Remove the third character to get SS instead of SSS
532        }
533
534        // Step 2: Handle last letter combinations
535        let len = word.len();
536        if len >= 2 {
537            let end = &word[len - 2..].iter().collect::<String>();
538            match end.as_str() {
539                "EE" | "IE" => {
540                    word.truncate(len - 2);
541                    word.push('Y');
542                }
543                "DT" | "RT" | "RD" | "NT" | "ND" => {
544                    word.truncate(len - 1);
545                    word.push('D');
546                }
547                _ => {}
548            }
549        }
550
551        // Step 3: First character of key is first character of name
552        let mut result = vec![word[0]];
553
554        // Step 4: Translate remaining characters
555        for i in 1..word.len() {
556            let ch = word[i];
557            let prev = word[i - 1];
558            let next = word.get(i + 1).copied();
559
560            match ch {
561                'A' => {
562                    // Add 'A' only if previous character is different and result doesn't end with 'A'
563                    if prev != 'A' && result.last() != Some(&'A') {
564                        result.push('A');
565                    }
566                }
567                'E' | 'I' | 'O' | 'U' => {
568                    if prev == ch {
569                        continue; // Skip repeated vowels
570                    }
571                    // Convert vowels to 'A', but avoid consecutive 'A's in the result
572                    if result.last() != Some(&'A') {
573                        result.push('A');
574                    }
575                }
576                'Q' => result.push('G'),
577                'Z' => result.push('S'),
578                'M' => result.push('N'),
579                'K' => {
580                    if next == Some('N') {
581                        result.push('N');
582                    } else {
583                        result.push('C');
584                    }
585                }
586                'S' => {
587                    if next == Some('C') && word.get(i + 2) == Some(&'H') {
588                        result.push('S');
589                        result.push('S');
590                        result.push('S');
591                    } else {
592                        result.push('S');
593                    }
594                }
595                'P' => {
596                    if next == Some('H') {
597                        result.push('F');
598                        result.push('F');
599                    } else {
600                        result.push('P');
601                    }
602                }
603                'H' => {
604                    // Skip 'H' if it follows 'G' (for GH combinations)
605                    if prev == 'G' {
606                        // Skip this 'H'
607                    } else if !matches!(prev, 'A' | 'E' | 'I' | 'O' | 'U')
608                        && !matches!(
609                            next,
610                            Some('A') | Some('E') | Some('I') | Some('O') | Some('U')
611                        )
612                        && prev != ch
613                    {
614                        result.push('H');
615                    }
616                }
617                'W' => {
618                    if matches!(prev, 'A' | 'E' | 'I' | 'O' | 'U') && prev != ch {
619                        result.push('W');
620                    }
621                }
622                _ => {
623                    if prev != ch {
624                        // Skip repeated consonants
625                        result.push(ch);
626                    } else if i == 1 && ch == 'F' && result.len() == 1 && result[0] == 'F' {
627                        // Special case: allow FF from PH conversion at start only
628                        result.push(ch);
629                    } else if i == 1 && ch == 'S' && result.len() == 1 && result[0] == 'S' {
630                        // Special case: allow SS from SCH conversion at start only
631                        result.push(ch);
632                    }
633                }
634            }
635        }
636
637        // Step 5: Remove trailing 'S', 'A', and 'H'
638        while result.len() > 1
639            && (result.last() == Some(&'S')
640                || result.last() == Some(&'A')
641                || result.last() == Some(&'H'))
642        {
643            result.pop();
644        }
645
646        // Step 6: Replace 'AY' at end with 'Y'
647        if result.len() >= 2 && result[result.len() - 2] == 'A' && result[result.len() - 1] == 'Y' {
648            result.pop();
649            result.pop();
650            result.push('Y');
651        }
652
653        // Apply max length if specified
654        let mut encoded: String = result.into_iter().collect();
655        if self.max_length > 0 && encoded.len() > self.max_length {
656            encoded.truncate(self.max_length);
657        }
658
659        Ok(encoded)
660    }
661}
662
663impl NeedlemanWunsch {
664    /// Create a new Needleman-Wunsch aligner with default parameters
665    pub fn new() -> Self {
666        Self {
667            match_score: 1,
668            mismatch_penalty: -1,
669            gap_penalty: -1,
670        }
671    }
672
673    /// Create with custom scoring parameters
674    pub fn with_scores(_match_score: i32, mismatch_penalty: i32, gappenalty: i32) -> Self {
675        Self {
676            match_score: _match_score,
677            mismatch_penalty,
678            gap_penalty: gappenalty,
679        }
680    }
681
682    /// Align two sequences using the Needleman-Wunsch algorithm
683    pub fn align(&self, seq1: &str, seq2: &str) -> AlignmentResult {
684        let seq1_chars: Vec<char> = seq1.chars().collect();
685        let seq2_chars: Vec<char> = seq2.chars().collect();
686        let m = seq1_chars.len();
687        let n = seq2_chars.len();
688
689        // Initialize scoring matrix
690        let mut matrix = vec![vec![0; n + 1]; m + 1];
691
692        // Initialize first row and column with gap penalties
693        for (i, item) in matrix.iter_mut().enumerate().take(m + 1) {
694            item[0] = i as i32 * self.gap_penalty;
695        }
696        for j in 0..=n {
697            matrix[0][j] = j as i32 * self.gap_penalty;
698        }
699
700        // Fill the matrix
701        for i in 1..=m {
702            for j in 1..=n {
703                let match_mismatch = if seq1_chars[i - 1] == seq2_chars[j - 1] {
704                    matrix[i - 1][j - 1] + self.match_score
705                } else {
706                    matrix[i - 1][j - 1] + self.mismatch_penalty
707                };
708
709                let delete = matrix[i - 1][j] + self.gap_penalty;
710                let insert = matrix[i][j - 1] + self.gap_penalty;
711
712                matrix[i][j] = *[match_mismatch, delete, insert].iter().max().unwrap();
713            }
714        }
715
716        // Traceback to find the alignment
717        let mut aligned_seq1 = String::new();
718        let mut aligned_seq2 = String::new();
719        let mut i = m;
720        let mut j = n;
721
722        while i > 0 || j > 0 {
723            if i > 0 && j > 0 {
724                let current_score = matrix[i][j];
725                let diagonal_score = if seq1_chars[i - 1] == seq2_chars[j - 1] {
726                    matrix[i - 1][j - 1] + self.match_score
727                } else {
728                    matrix[i - 1][j - 1] + self.mismatch_penalty
729                };
730                let up_score = matrix[i - 1][j] + self.gap_penalty;
731                let left_score = matrix[i][j - 1] + self.gap_penalty;
732
733                // Prioritize diagonal (match/mismatch), then left (insertion), then up (deletion)
734                // This ensures consistent behavior when multiple paths have the same score
735                if current_score == diagonal_score {
736                    aligned_seq1.insert(0, seq1_chars[i - 1]);
737                    aligned_seq2.insert(0, seq2_chars[j - 1]);
738                    i -= 1;
739                    j -= 1;
740                } else if current_score == left_score {
741                    aligned_seq1.insert(0, '-');
742                    aligned_seq2.insert(0, seq2_chars[j - 1]);
743                    j -= 1;
744                } else if current_score == up_score {
745                    aligned_seq1.insert(0, seq1_chars[i - 1]);
746                    aligned_seq2.insert(0, '-');
747                    i -= 1;
748                } else {
749                    // Fallback case - should not happen with correct implementation
750                    aligned_seq1.insert(0, seq1_chars[i - 1]);
751                    aligned_seq2.insert(0, seq2_chars[j - 1]);
752                    i -= 1;
753                    j -= 1;
754                }
755            } else if i > 0 {
756                aligned_seq1.insert(0, seq1_chars[i - 1]);
757                aligned_seq2.insert(0, '-');
758                i -= 1;
759            } else {
760                aligned_seq1.insert(0, '-');
761                aligned_seq2.insert(0, seq2_chars[j - 1]);
762                j -= 1;
763            }
764        }
765
766        AlignmentResult {
767            aligned_seq1,
768            aligned_seq2,
769            score: matrix[m][n],
770        }
771    }
772}
773
774impl Default for NeedlemanWunsch {
775    fn default() -> Self {
776        Self::new()
777    }
778}
779
780impl SmithWaterman {
781    /// Create a new Smith-Waterman aligner with default parameters
782    pub fn new() -> Self {
783        Self {
784            match_score: 2,
785            mismatch_penalty: -1,
786            gap_penalty: -1,
787        }
788    }
789
790    /// Create with custom scoring parameters
791    pub fn with_scores(_match_score: i32, mismatch_penalty: i32, gappenalty: i32) -> Self {
792        Self {
793            match_score: _match_score,
794            mismatch_penalty,
795            gap_penalty: gappenalty,
796        }
797    }
798
799    /// Align two sequences using the Smith-Waterman algorithm
800    pub fn align(&self, seq1: &str, seq2: &str) -> AlignmentResult {
801        let seq1_chars: Vec<char> = seq1.chars().collect();
802        let seq2_chars: Vec<char> = seq2.chars().collect();
803        let m = seq1_chars.len();
804        let n = seq2_chars.len();
805
806        // Initialize scoring matrix
807        let mut matrix = vec![vec![0; n + 1]; m + 1];
808        let mut max_score = 0;
809        let mut max_i = 0;
810        let mut max_j = 0;
811
812        // Fill the matrix
813        for i in 1..=m {
814            for j in 1..=n {
815                let match_mismatch = if seq1_chars[i - 1] == seq2_chars[j - 1] {
816                    matrix[i - 1][j - 1] + self.match_score
817                } else {
818                    matrix[i - 1][j - 1] + self.mismatch_penalty
819                };
820
821                let delete = matrix[i - 1][j] + self.gap_penalty;
822                let insert = matrix[i][j - 1] + self.gap_penalty;
823
824                matrix[i][j] = *[0, match_mismatch, delete, insert].iter().max().unwrap();
825
826                // Track maximum score position
827                if matrix[i][j] > max_score {
828                    max_score = matrix[i][j];
829                    max_i = i;
830                    max_j = j;
831                }
832            }
833        }
834
835        // Traceback from the maximum score position
836        let mut aligned_seq1 = String::new();
837        let mut aligned_seq2 = String::new();
838        let mut i = max_i;
839        let mut j = max_j;
840
841        while i > 0 && j > 0 && matrix[i][j] > 0 {
842            let current_score = matrix[i][j];
843            let diagonal_score = if seq1_chars[i - 1] == seq2_chars[j - 1] {
844                matrix[i - 1][j - 1] + self.match_score
845            } else {
846                matrix[i - 1][j - 1] + self.mismatch_penalty
847            };
848
849            if current_score == diagonal_score {
850                aligned_seq1.insert(0, seq1_chars[i - 1]);
851                aligned_seq2.insert(0, seq2_chars[j - 1]);
852                i -= 1;
853                j -= 1;
854            } else if current_score == matrix[i - 1][j] + self.gap_penalty {
855                aligned_seq1.insert(0, seq1_chars[i - 1]);
856                aligned_seq2.insert(0, '-');
857                i -= 1;
858            } else if current_score == matrix[i][j - 1] + self.gap_penalty {
859                aligned_seq1.insert(0, '-');
860                aligned_seq2.insert(0, seq2_chars[j - 1]);
861                j -= 1;
862            } else {
863                // This shouldn't happen in correct implementation
864                break;
865            }
866        }
867
868        AlignmentResult {
869            aligned_seq1,
870            aligned_seq2,
871            score: max_score,
872        }
873    }
874}
875
876impl Default for SmithWaterman {
877    fn default() -> Self {
878        Self::new()
879    }
880}
881
882#[cfg(test)]
883mod tests {
884    use super::*;
885
886    #[test]
887    fn test_damerau_levenshtein() {
888        let metric = DamerauLevenshteinMetric::new();
889
890        // Basic operations
891        assert_eq!(metric.distance("", "").unwrap(), 0.0);
892        assert_eq!(metric.distance("abc", "").unwrap(), 3.0);
893        assert_eq!(metric.distance("", "abc").unwrap(), 3.0);
894        assert_eq!(metric.distance("abc", "abc").unwrap(), 0.0);
895
896        // Single operations
897        assert_eq!(metric.distance("abc", "aXc").unwrap(), 1.0); // substitution
898        assert_eq!(metric.distance("abc", "ac").unwrap(), 1.0); // deletion
899        assert_eq!(metric.distance("ac", "abc").unwrap(), 1.0); // insertion
900        assert_eq!(metric.distance("abc", "acb").unwrap(), 1.0); // transposition
901
902        // Multiple operations
903        assert_eq!(metric.distance("kitten", "sitting").unwrap(), 3.0);
904
905        // Test normalized distance
906        assert!((metric.normalized_distance("abc", "aXc").unwrap() - 0.333).abs() < 0.01);
907        assert_eq!(metric.normalized_distance("", "").unwrap(), 0.0);
908
909        // Test similarity
910        assert!((metric.similarity("abc", "aXc").unwrap() - 0.667).abs() < 0.01);
911    }
912
913    #[test]
914    fn test_restricted_damerau_levenshtein() {
915        let metric = DamerauLevenshteinMetric::restricted();
916
917        // OSA doesn't allow substring transpositions
918        assert_eq!(metric.distance("ca", "abc").unwrap(), 3.0); // Not 2.0 as in full DL
919    }
920
921    #[test]
922    fn test_soundex() {
923        let soundex = Soundex::new();
924
925        assert_eq!(soundex.encode("Robert").unwrap(), "R163");
926        assert_eq!(soundex.encode("Rupert").unwrap(), "R163");
927        assert_eq!(soundex.encode("SOUNDEX").unwrap(), "S532");
928        assert_eq!(soundex.encode("Smith").unwrap(), "S530");
929        assert_eq!(soundex.encode("Smythe").unwrap(), "S530");
930        assert_eq!(soundex.encode("").unwrap(), "");
931        assert_eq!(soundex.encode("123").unwrap(), "");
932
933        // Test sounds_like
934        assert!(soundex.sounds_like("Robert", "Rupert").unwrap());
935        assert!(soundex.sounds_like("Smith", "Smythe").unwrap());
936        assert!(!soundex.sounds_like("Smith", "Jones").unwrap());
937
938        // Test custom length
939        let soundex_5 = Soundex::with_length(5);
940        assert_eq!(soundex_5.encode("SOUNDEX").unwrap(), "S5320");
941    }
942
943    #[test]
944    fn test_metaphone() {
945        let metaphone = Metaphone::new();
946
947        assert_eq!(metaphone.encode("programming").unwrap(), "PRKRMN");
948        assert_eq!(metaphone.encode("programmer").unwrap(), "PRKRMR");
949        assert_eq!(metaphone.encode("Wright").unwrap(), "RT");
950        assert_eq!(metaphone.encode("White").unwrap(), "WT");
951        assert_eq!(metaphone.encode("Knight").unwrap(), "NT");
952        assert_eq!(metaphone.encode("").unwrap(), "");
953        assert_eq!(metaphone.encode("123").unwrap(), "");
954
955        // Test sounds_like - "Wright" and "Write" actually should sound similar in Metaphone
956        assert!(metaphone.sounds_like("Wright", "Write").unwrap());
957        assert!(!metaphone.sounds_like("White", "Wright").unwrap());
958
959        // Test custom max length
960        let metaphone_3 = Metaphone::with_max_length(3);
961        assert_eq!(metaphone_3.encode("programming").unwrap(), "PRK");
962    }
963
964    #[test]
965    fn test_phonetic_edge_cases() {
966        let soundex = Soundex::new();
967        let metaphone = Metaphone::new();
968
969        // Test with non-alphabetic characters
970        assert_eq!(soundex.encode("O'Brien").unwrap(), "O165");
971        assert_eq!(metaphone.encode("O'Brien").unwrap(), "OBRN");
972
973        // Test with mixed case
974        assert_eq!(soundex.encode("McDonald").unwrap(), "M235");
975        assert_eq!(metaphone.encode("McDonald").unwrap(), "MKTNLT");
976    }
977
978    #[test]
979    fn test_nysiis() {
980        let nysiis = Nysiis::new();
981
982        // Basic tests
983        assert_eq!(nysiis.encode("Johnson").unwrap(), "JANSAN");
984        assert_eq!(nysiis.encode("Williams").unwrap(), "WALAN"); // Standard NYSIIS code
985        assert_eq!(nysiis.encode("Jones").unwrap(), "JAN");
986        assert_eq!(nysiis.encode("Smith").unwrap(), "SNAT");
987        assert_eq!(nysiis.encode("MacDonald").unwrap(), "MCDANALD");
988        assert_eq!(nysiis.encode("Knight").unwrap(), "NAGT");
989        assert_eq!(nysiis.encode("").unwrap(), "");
990        assert_eq!(nysiis.encode("123").unwrap(), "");
991
992        // Test sounds_like
993        assert!(nysiis.sounds_like("Johnson", "Jonson").unwrap());
994        assert!(!nysiis.sounds_like("Smith", "Jones").unwrap());
995
996        // Test edge cases
997        assert_eq!(nysiis.encode("Philips").unwrap(), "FFALAP");
998        assert_eq!(nysiis.encode("Schmidt").unwrap(), "SSNAD");
999        assert_eq!(nysiis.encode("Schneider").unwrap(), "SSNADAR");
1000
1001        // Test with max length
1002        let nysiis_6 = Nysiis::with_max_length(6);
1003        assert_eq!(nysiis_6.encode("Williams").unwrap(), "WALAN"); // 5 chars, so not truncated
1004        assert_eq!(nysiis_6.encode("MacDonald").unwrap(), "MCDANA"); // 6 chars, truncated from longer
1005    }
1006
1007    #[test]
1008    fn test_needleman_wunsch() {
1009        let aligner = NeedlemanWunsch::new();
1010
1011        // Test simple alignment
1012        let result = aligner.align("GATTACA", "GCATGCU");
1013
1014        // Check that we get a valid optimal alignment with correct score
1015        assert_eq!(result.aligned_seq1, "G-ATTACA");
1016        assert_eq!(result.score, 0);
1017
1018        // Verify the alignment is valid by checking that all characters from seq2 are present
1019        let seq2_chars = result
1020            .aligned_seq2
1021            .chars()
1022            .filter(|&c| c != '-')
1023            .collect::<String>();
1024        assert_eq!(seq2_chars, "GCATGCU");
1025
1026        // Verify alignment length consistency
1027        assert_eq!(result.aligned_seq1.len(), result.aligned_seq2.len());
1028
1029        // Test identical sequences
1030        let result = aligner.align("HELLO", "HELLO");
1031        assert_eq!(result.aligned_seq1, "HELLO");
1032        assert_eq!(result.aligned_seq2, "HELLO");
1033        assert_eq!(result.score, 5);
1034
1035        // Test empty sequences
1036        let result = aligner.align("", "ABC");
1037        assert_eq!(result.aligned_seq1, "---");
1038        assert_eq!(result.aligned_seq2, "ABC");
1039        assert_eq!(result.score, -3);
1040
1041        let result = aligner.align("ABC", "");
1042        assert_eq!(result.aligned_seq1, "ABC");
1043        assert_eq!(result.aligned_seq2, "---");
1044        assert_eq!(result.score, -3);
1045
1046        // Test with custom scores
1047        let custom_aligner = NeedlemanWunsch::with_scores(2, -2, -1);
1048        let result = custom_aligner.align("CAT", "CART");
1049        assert!(result.score > 0);
1050    }
1051
1052    #[test]
1053    fn test_smith_waterman() {
1054        let aligner = SmithWaterman::new();
1055
1056        // Test local alignment
1057        let result = aligner.align("GGTTGACTA", "TGTTACGG");
1058        assert!(result.score > 0);
1059        assert!(result.aligned_seq1.contains("GTT"));
1060        assert!(result.aligned_seq2.contains("GTT"));
1061
1062        // Test with sequences having common substring
1063        let result = aligner.align("ABCDEFG", "XYZCDEFPQ");
1064        assert_eq!(result.aligned_seq1, "CDEF");
1065        assert_eq!(result.aligned_seq2, "CDEF");
1066        assert_eq!(result.score, 8); // 4 matches * 2
1067
1068        // Test empty sequences
1069        let result = aligner.align("", "ABC");
1070        assert_eq!(result.aligned_seq1, "");
1071        assert_eq!(result.aligned_seq2, "");
1072        assert_eq!(result.score, 0);
1073
1074        // Test no common subsequence
1075        let result = aligner.align("AAA", "BBB");
1076        assert_eq!(result.score, 0);
1077
1078        // Test with custom scores
1079        let custom_aligner = SmithWaterman::with_scores(3, -3, -2);
1080        let result = custom_aligner.align("ACACACTA", "AGCACACA");
1081        assert!(result.score > 0);
1082    }
1083}