Skip to main content

kondrak_aline/
lib.rs

1#![doc = include_str!("../README.md")]
2
3use std::{collections::HashSet, f64};
4
5use constants::EXTRACTED;
6use unicode_segmentation::UnicodeSegmentation;
7mod constants;
8
9#[cfg(test)]
10mod test;
11
12/// Compute the alignment of two phonetic strings.
13/// 
14/// (Kondrak 2002: 51)
15pub fn align(str1: &str, str2: &str, epsilon: f64) -> Vec<Vec<(String, String)>> {
16    assert!(
17        (0.0..=1.0).contains(&epsilon),
18        "Epsilon must be between 0.0 and 1.0."
19    );
20
21    let str1_chars: Vec<&str> = str1.graphemes(true).collect();
22    let str2_chars: Vec<&str> = str2.graphemes(true).collect();
23    let m = str1_chars.len();
24    let n = str2_chars.len();
25
26    // This includes Kondrak's initialization of row 0 and column 0 to all 0s.
27    let mut s = vec![vec![0.0; n + 1]; m + 1];
28    for i in 1..=m {
29        for j in 1..=n {
30            let edit1 = s[i - 1][j] + sigma_skip();
31            let edit2 = s[i][j - 1] + sigma_skip();
32
33            let edit3 = s[i - 1][j - 1] + sigma_sub(str1_chars[i - 1], str2_chars[j - 1]);
34
35            let edit4 = if i > 1 {
36                s[i - 2][j - 1] + sigma_exp(str2_chars[j - 1], str1_chars[i - 2], str1_chars[i - 1])
37            } else {
38                -f64::INFINITY
39            };
40
41            let edit5 = if j > 1 {
42                s[i - 1][j - 2] + sigma_exp(str1_chars[i - 1], str2_chars[j - 2], str2_chars[j - 1])
43            } else {
44                -f64::INFINITY
45            };
46
47            s[i][j] = [edit1, edit2, edit3, edit4, edit5]
48                .iter()
49                .fold(0f64, |prev, curr| f64::max(prev, *curr));
50        }
51    }
52
53    let t = (1.0 - epsilon)
54        * s.iter()
55            .flat_map(|row| row.iter())
56            .cloned()
57            .fold(f64::NAN, f64::max);
58
59    let mut aligns = Vec::new();
60    for i in 1..=m {
61        for j in 1..=n {
62            if s[i][j] >= t {
63                let mut out = Vec::new();
64                retrieve(i, j, 0.0, &s, t, &str1_chars, &str2_chars, &mut out);
65                aligns.push(out);
66            }
67        }
68    }
69    aligns
70}
71
72/// Retrieve the path through the similarity matrix S starting at (i, j).
73#[inline]
74fn retrieve<'a>(
75    i: usize,
76    j: usize,
77    score: f64,
78    s: &Vec<Vec<f64>>,
79    t: f64,
80    str1: &[&str],
81    str2: &[&str],
82    out: &'a mut Vec<(String, String)>,
83) -> &'a mut Vec<(String, String)> {
84    if s[i][j] == 0.0 {
85        return out;
86    }
87
88    if j > 1 && (s[i - 1][j - 2] + sigma_exp(str1[i - 1], str2[j - 2], str2[j - 1]) + score) >= t {
89        let key = str2[j - 2..j].join("");
90        out.insert(0, (str1[i - 1].to_string(), key));
91
92        retrieve(
93            i - 1,
94            j - 2,
95            score + sigma_exp(str1[i - 1], str2[j - 2], str2[j - 1]),
96            s,
97            t,
98            str1,
99            str2,
100            out,
101        );
102    } else if i > 1
103        && (s[i - 2][j - 1] + sigma_exp(str2[j - 1], str1[i - 2], str1[i - 1]) + score) >= t
104    {
105        let key = str1[i - 2..i].join("");
106        out.insert(0, (key, str2[j - 1].to_string()));
107
108        retrieve(
109            i - 2,
110            j - 1,
111            score + sigma_exp(str2[j - 1], str1[i - 2], str1[i - 1]),
112            s,
113            t,
114            str1,
115            str2,
116            out,
117        );
118    } else if (s[i][j - 1] + sigma_skip() + score) >= t {
119        out.insert(0, ("-".to_string(), str2[j - 1].to_string()));
120        retrieve(i, j - 1, score + sigma_skip(), s, t, str1, str2, out);
121    } else if (s[i - 1][j] + sigma_skip() + score) >= t {
122        out.insert(0, (str1[i - 1].to_string(), "-".to_string()));
123        retrieve(i - 1, j, score + sigma_skip(), s, t, str1, str2, out);
124    } else if (s[i - 1][j - 1] + sigma_sub(str1[i - 1], str2[j - 1]) + score) >= t {
125        out.insert(0, (str1[i - 1].to_string(), str2[j - 1].to_string()));
126
127        retrieve(
128            i - 1,
129            j - 1,
130            score + sigma_sub(str1[i - 1], str2[j - 1]),
131            s,
132            t,
133            str1,
134            str2,
135            out,
136        );
137    }
138
139    return out;
140}
141
142/// Returns score of an indel of P.
143///
144/// (Kondrak 2002: 54)
145#[inline]
146pub fn sigma_skip() -> f64 {
147    EXTRACTED.cskip
148}
149
150/// Returns score of a substitution of P with Q.
151///
152/// (Kondrak 2002: 54)
153#[inline]
154pub fn sigma_sub(p: &str, q: &str) -> f64 {
155    EXTRACTED.csub - delta(p, q) - v(p) - v(q)
156}
157
158/// Returns score of an expansion/compression.
159///
160/// (Kondrak 2002: 54)
161#[inline]
162pub fn sigma_exp(p: &str, q1: &str, q2: &str) -> f64 {
163    EXTRACTED.cexp - delta(p, q1) - delta(p, q2) - v(p) - f64::max(v(q1), v(q2))
164}
165
166/// Return weighted sum of difference between P and Q.
167///
168/// (Kondrak 2002: 54)
169#[inline]
170pub fn delta(p: &str, q: &str) -> f64 {
171    let features = r(p, q);
172    features
173        .iter()
174        .map(|f| diff(p, q, f) * *EXTRACTED.salience.get(f).unwrap_or_else(|| unreachable!()))
175        .sum()
176}
177
178/// Returns difference between phonetic segments P and Q for feature F.
179///
180/// (Kondrak 2002: 52, 54)
181#[inline]
182pub fn diff(p: &str, q: &str, f: &str) -> f64 {
183    let p_features = &EXTRACTED.feature_matrix[&p.to_string()][f];
184    let q_features = &EXTRACTED.feature_matrix[&q.to_string()][f];
185    let p_similarity = *EXTRACTED
186        .similarity_matrix
187        .get(p_features)
188        .unwrap_or_else(|| unreachable!());
189    let q_similarity = *EXTRACTED
190        .similarity_matrix
191        .get(q_features)
192        .unwrap_or_else(|| unreachable!());
193    (p_similarity - q_similarity).abs()
194}
195
196/// Return relevant features for segment comparison.
197///
198/// (Kondrak 2002: 54)
199#[inline]
200pub fn r<'a>(p: &str, q: &str) -> &'static HashSet<String> {
201    if EXTRACTED.consonants.contains(&p.to_string())
202        || EXTRACTED.consonants.contains(&q.to_string())
203    {
204        &EXTRACTED.rc
205    } else {
206        &EXTRACTED.rv
207    }
208}
209
210/// Return vowel weight if P is vowel.
211///
212/// (Kondrak 2002: 54)
213#[inline]
214pub fn v(p: &str) -> f64 {
215    if !EXTRACTED.consonants.contains(&p.to_string()) {
216        EXTRACTED.cvwl
217    } else {
218        0.0
219    }
220}