1#![doc = include_str!("../README.md")]
2
3use std::{collections::HashSet, f64};
4
5use constants::EXTRACTED;
6use unicode_segmentation::UnicodeSegmentation;
7mod constants;
8
9#[cfg(test)]
10mod test;
11
12pub fn align(str1: &str, str2: &str, epsilon: f64) -> Vec<Vec<(String, String)>> {
16 assert!(
17 (0.0..=1.0).contains(&epsilon),
18 "Epsilon must be between 0.0 and 1.0."
19 );
20
21 let str1_chars: Vec<&str> = str1.graphemes(true).collect();
22 let str2_chars: Vec<&str> = str2.graphemes(true).collect();
23 let m = str1_chars.len();
24 let n = str2_chars.len();
25
26 let mut s = vec![vec![0.0; n + 1]; m + 1];
28 for i in 1..=m {
29 for j in 1..=n {
30 let edit1 = s[i - 1][j] + sigma_skip();
31 let edit2 = s[i][j - 1] + sigma_skip();
32
33 let edit3 = s[i - 1][j - 1] + sigma_sub(str1_chars[i - 1], str2_chars[j - 1]);
34
35 let edit4 = if i > 1 {
36 s[i - 2][j - 1] + sigma_exp(str2_chars[j - 1], str1_chars[i - 2], str1_chars[i - 1])
37 } else {
38 -f64::INFINITY
39 };
40
41 let edit5 = if j > 1 {
42 s[i - 1][j - 2] + sigma_exp(str1_chars[i - 1], str2_chars[j - 2], str2_chars[j - 1])
43 } else {
44 -f64::INFINITY
45 };
46
47 s[i][j] = [edit1, edit2, edit3, edit4, edit5]
48 .iter()
49 .fold(0f64, |prev, curr| f64::max(prev, *curr));
50 }
51 }
52
53 let t = (1.0 - epsilon)
54 * s.iter()
55 .flat_map(|row| row.iter())
56 .cloned()
57 .fold(f64::NAN, f64::max);
58
59 let mut aligns = Vec::new();
60 for i in 1..=m {
61 for j in 1..=n {
62 if s[i][j] >= t {
63 let mut out = Vec::new();
64 retrieve(i, j, 0.0, &s, t, &str1_chars, &str2_chars, &mut out);
65 aligns.push(out);
66 }
67 }
68 }
69 aligns
70}
71
72#[inline]
74fn retrieve<'a>(
75 i: usize,
76 j: usize,
77 score: f64,
78 s: &Vec<Vec<f64>>,
79 t: f64,
80 str1: &[&str],
81 str2: &[&str],
82 out: &'a mut Vec<(String, String)>,
83) -> &'a mut Vec<(String, String)> {
84 if s[i][j] == 0.0 {
85 return out;
86 }
87
88 if j > 1 && (s[i - 1][j - 2] + sigma_exp(str1[i - 1], str2[j - 2], str2[j - 1]) + score) >= t {
89 let key = str2[j - 2..j].join("");
90 out.insert(0, (str1[i - 1].to_string(), key));
91
92 retrieve(
93 i - 1,
94 j - 2,
95 score + sigma_exp(str1[i - 1], str2[j - 2], str2[j - 1]),
96 s,
97 t,
98 str1,
99 str2,
100 out,
101 );
102 } else if i > 1
103 && (s[i - 2][j - 1] + sigma_exp(str2[j - 1], str1[i - 2], str1[i - 1]) + score) >= t
104 {
105 let key = str1[i - 2..i].join("");
106 out.insert(0, (key, str2[j - 1].to_string()));
107
108 retrieve(
109 i - 2,
110 j - 1,
111 score + sigma_exp(str2[j - 1], str1[i - 2], str1[i - 1]),
112 s,
113 t,
114 str1,
115 str2,
116 out,
117 );
118 } else if (s[i][j - 1] + sigma_skip() + score) >= t {
119 out.insert(0, ("-".to_string(), str2[j - 1].to_string()));
120 retrieve(i, j - 1, score + sigma_skip(), s, t, str1, str2, out);
121 } else if (s[i - 1][j] + sigma_skip() + score) >= t {
122 out.insert(0, (str1[i - 1].to_string(), "-".to_string()));
123 retrieve(i - 1, j, score + sigma_skip(), s, t, str1, str2, out);
124 } else if (s[i - 1][j - 1] + sigma_sub(str1[i - 1], str2[j - 1]) + score) >= t {
125 out.insert(0, (str1[i - 1].to_string(), str2[j - 1].to_string()));
126
127 retrieve(
128 i - 1,
129 j - 1,
130 score + sigma_sub(str1[i - 1], str2[j - 1]),
131 s,
132 t,
133 str1,
134 str2,
135 out,
136 );
137 }
138
139 return out;
140}
141
142#[inline]
146pub fn sigma_skip() -> f64 {
147 EXTRACTED.cskip
148}
149
150#[inline]
154pub fn sigma_sub(p: &str, q: &str) -> f64 {
155 EXTRACTED.csub - delta(p, q) - v(p) - v(q)
156}
157
158#[inline]
162pub fn sigma_exp(p: &str, q1: &str, q2: &str) -> f64 {
163 EXTRACTED.cexp - delta(p, q1) - delta(p, q2) - v(p) - f64::max(v(q1), v(q2))
164}
165
166#[inline]
170pub fn delta(p: &str, q: &str) -> f64 {
171 let features = r(p, q);
172 features
173 .iter()
174 .map(|f| diff(p, q, f) * *EXTRACTED.salience.get(f).unwrap_or_else(|| unreachable!()))
175 .sum()
176}
177
178#[inline]
182pub fn diff(p: &str, q: &str, f: &str) -> f64 {
183 let p_features = &EXTRACTED.feature_matrix[&p.to_string()][f];
184 let q_features = &EXTRACTED.feature_matrix[&q.to_string()][f];
185 let p_similarity = *EXTRACTED
186 .similarity_matrix
187 .get(p_features)
188 .unwrap_or_else(|| unreachable!());
189 let q_similarity = *EXTRACTED
190 .similarity_matrix
191 .get(q_features)
192 .unwrap_or_else(|| unreachable!());
193 (p_similarity - q_similarity).abs()
194}
195
196#[inline]
200pub fn r<'a>(p: &str, q: &str) -> &'static HashSet<String> {
201 if EXTRACTED.consonants.contains(&p.to_string())
202 || EXTRACTED.consonants.contains(&q.to_string())
203 {
204 &EXTRACTED.rc
205 } else {
206 &EXTRACTED.rv
207 }
208}
209
210#[inline]
214pub fn v(p: &str) -> f64 {
215 if !EXTRACTED.consonants.contains(&p.to_string()) {
216 EXTRACTED.cvwl
217 } else {
218 0.0
219 }
220}