1#![deny(warnings)]
2#[cfg(test)]
4mod tests;
5#[cfg(test)]
6extern crate quickcheck;
7#[cfg(test)]
8extern crate quickcheck_macros;
9extern crate seqdiff;
10extern crate unicode_normalization;
11use seqdiff::Diff;
12use unicode_normalization::UnicodeNormalization;
13
/// Token-level alignment as returned by `get_alignments`: entry `i` holds the
/// indices of the tokens on the other side that token `i` was matched with.
pub type Alignment = Vec<Vec<usize>>;
/// Character-level mapping as returned by `get_charmap`: same layout as
/// `Alignment`, but every index refers to a single character.
pub type CharMap = Vec<Vec<usize>>;
16
17fn normalize(text: &str) -> String {
18 text.to_lowercase().nfkd().collect()
19}
20
/// Builds a lookup table from character position to token index.
///
/// Viewing `tokens` as one concatenated string, entry `k` of the result is the
/// index of the token that contains the `k`-th character. Lengths are counted
/// in `char`s (not bytes), so an empty token contributes no entries.
fn get_char2token<T: AsRef<str>>(tokens: &[T]) -> Vec<usize> {
    tokens
        .iter()
        .enumerate()
        .flat_map(|(index, token)| {
            // Emit the token's own index once per character it contains.
            let width = token.as_ref().chars().count();
            std::iter::repeat(index).take(width)
        })
        .collect()
}
36
/// Lifts a character-level mapping to a token-level one.
///
/// * `num_tokens` - number of source tokens; the result has exactly this many rows.
/// * `a2b` - for each source character, the matched target character index, or `None`.
/// * `ac2t` - source character index -> source token index.
/// * `bc2t` - target character index -> target token index.
///
/// Returns, for every source token, the target token indices reached by its
/// characters in order of appearance. A target token is skipped only when it
/// equals the most recently recorded one for that source token, so
/// non-adjacent repeats are kept. `ac2t` and `a2b` are walked in lockstep and
/// the walk stops at the shorter of the two.
///
/// # Panics
///
/// Panics if a value in `ac2t` is not below `num_tokens`, or if a `Some` index
/// in `a2b` is out of range for `bc2t`.
fn get_alignment(
    num_tokens: usize,
    a2b: &[Option<usize>],
    ac2t: &[usize],
    bc2t: &[usize],
) -> Vec<Vec<usize>> {
    let mut at2bt: Vec<Vec<usize>> = vec![Vec::new(); num_tokens];
    for (&src_token, &mapped) in ac2t.iter().zip(a2b.iter()) {
        // Characters without a counterpart contribute nothing.
        let dst_char = match mapped {
            Some(index) => index,
            None => continue,
        };
        let row = &mut at2bt[src_token];
        let dst_token = bc2t[dst_char];
        // Collapse runs of the same target token into a single entry.
        if row.last() != Some(&dst_token) {
            row.push(dst_token);
        }
    }
    at2bt
}
57
58pub fn get_alignments<S: AsRef<str>>(a: &[S], b: &[S]) -> (Alignment, Alignment) {
83 let a: Vec<String> = a.iter().map(|x| normalize(x.as_ref())).collect();
84 let b: Vec<String> = b.iter().map(|x| normalize(x.as_ref())).collect();
85 let ac2t = get_char2token(&a);
86 let bc2t = get_char2token(&b);
87 let (a2b, b2a) = seqdiff::diff(
88 &a.join("").chars().collect::<Vec<_>>(),
89 &b.join("").chars().collect::<Vec<_>>(),
90 );
91 let at2bt = get_alignment(a.len(), &a2b, &ac2t, &bc2t);
92 let bt2at = get_alignment(b.len(), &b2a, &bc2t, &ac2t);
93 (at2bt, bt2at)
94}
95
96pub fn get_charmap(a: &str, b: &str) -> (CharMap, CharMap) {
113 let at: Vec<String> = a.chars().map(|x| x.to_string()).collect();
114 let bt: Vec<String> = b.chars().map(|x| x.to_string()).collect();
115 get_alignments(&at, &bt)
116}
117
118fn _get_charmap(a: &str, b: &str) -> (Diff, Diff) {
121 let at: Vec<String> = a.chars().map(|x| x.to_string()).collect();
122 let bt: Vec<String> = b.chars().map(|x| x.to_string()).collect();
123 let (a2b, b2a) = get_alignments(&at, &bt);
124 let c_a2b: Diff = a2b.into_iter().map(|x| x.into_iter().next()).collect();
125 let c_b2a: Diff = b2a.into_iter().map(|x| x.into_iter().next()).collect();
126 (c_a2b, c_b2a)
127}
128
/// Returns the half-open `(start, end)` character span of every token within
/// the concatenation of all tokens.
///
/// Offsets are counted in `char`s. Spans are contiguous: each one starts where
/// the previous one ended, and an empty token yields `start == end`.
fn get_span_indices<S: AsRef<str>>(tokens: &[S]) -> Vec<(usize, usize)> {
    let mut spans = Vec::with_capacity(tokens.len());
    let mut start = 0;
    for token in tokens {
        let end = start + token.as_ref().chars().count();
        spans.push((start, end));
        start = end;
    }
    spans
}
140
/// Concatenates all tokens, in order, into one `String` with no separator.
fn join<S: AsRef<str>>(tokens: &[S]) -> String {
    tokens.iter().fold(String::new(), |mut text, token| {
        text.push_str(token.as_ref());
        text
    })
}
148
149#[deprecated(since = "0.5.0", note = "please use `textspan::align_spans` instead")]
150pub fn get_original_spans<S: AsRef<str>>(
151 tokens: &[S],
152 original_text: &str,
153) -> Vec<Option<(usize, usize)>> {
154 let spans = get_span_indices(tokens);
155 let text = join(tokens);
156 let (a2b, b2a) = _get_charmap(&text, original_text);
157
158 let mut ret = vec![];
159 for (l, r) in spans {
160 let mut origl = None;
162 for &x in a2b[l..r].iter() {
163 if x != None {
164 origl = x;
165 break;
166 }
167 }
168 let mut origr = None;
170 for x in a2b[l..r].iter().rev() {
171 if let Some(j) = x {
172 origr = Some(j + 1);
173 break;
174 }
175 }
176 if l == r {
178 if l >= a2b.len() {
179 origl = Some(b2a.len());
180 } else {
181 origl = a2b[l];
182 }
183 origr = origl;
184 }
185 ret.push(match (origl, origr) {
186 (Some(l), Some(r)) => Some((l, r)),
187 (None, None) => None,
188 _ => unreachable!(
189 "Internal error occured in get_original_span\ntokens: {:?}\noriginal_text: {:?}",
190 tokens.iter().map(|x| x.as_ref()).collect::<Vec<_>>(),
191 original_text
192 ),
193 })
194 }
195 ret
196}