tokenizations/
lib.rs

1#![deny(warnings)]
2//! Tokenizations alignment functions.
3#[cfg(test)]
4mod tests;
5#[cfg(test)]
6extern crate quickcheck;
7#[cfg(test)]
8extern crate quickcheck_macros;
9extern crate seqdiff;
10extern crate unicode_normalization;
11use seqdiff::Diff;
12use unicode_normalization::UnicodeNormalization;
13
14pub type Alignment = Vec<Vec<usize>>;
15pub type CharMap = Vec<Vec<usize>>;
16
17fn normalize(text: &str) -> String {
18    text.to_lowercase().nfkd().collect()
19}
20
/// Maps every character position of the concatenated `tokens` to the index of
/// the token that the character belongs to.
///
/// Lengths are measured in `char`s (not bytes); an empty token contributes no
/// entries to the result.
fn get_char2token<T: AsRef<str>>(tokens: &[T]) -> Vec<usize> {
    tokens
        .iter()
        .enumerate()
        .flat_map(|(index, token)| {
            // Emit the token index once per character of that token.
            let width = token.as_ref().chars().count();
            std::iter::repeat(index).take(width)
        })
        .collect()
}
36
// Builds the token-level alignment from tokenization `a` to tokenization `b`.
//
// * `num_tokens` - number of tokens in `a` (length of the returned vector).
// * `a2b`        - for each character of `a`, the matched character index in `b`, if any.
// * `ac2t`       - character index -> token index for `a`.
// * `bc2t`       - character index -> token index for `b`.
//
// For every token of `a`, the result lists the `b` token indices reached through
// its matched characters, skipping an index equal to the one pushed just before.
fn get_alignment(
    num_tokens: usize,
    a2b: &[Option<usize>],
    ac2t: &[usize],
    bc2t: &[usize],
) -> Vec<Vec<usize>> {
    let mut at2bt: Vec<Vec<usize>> = vec![Vec::new(); num_tokens];
    for (&src_token, &matched) in ac2t.iter().zip(a2b.iter()) {
        // Characters without a counterpart contribute nothing.
        let dst_char = match matched {
            Some(c) => c,
            None => continue,
        };
        let row = &mut at2bt[src_token];
        let dst_token = bc2t[dst_char];
        // Only consecutive duplicates are collapsed.
        if row.last() != Some(&dst_token) {
            row.push(dst_token);
        }
    }
    at2bt
}
57
58/// Returns the tokenizations alignments `a2b` (from `a` to `b`) and `b2a` (from `b` to `a`) based on the shortest edit script (SES).
59///
60/// # Examples
61///
62/// ```
63/// use tokenizations::get_alignments;
64///
65/// let a = vec!["New York"];
66/// let b = vec!["New", "York"];
67/// // calculate the two alignments `a2b` and `b2a` at the same time
68/// let (a2b, b2a) = get_alignments(&a, &b);
69///
70/// // `a2b[i]` is a set that holds indices `j`s of `b` such that `a[i]` corresponds to `b[j]`
71/// assert_eq!(a2b, vec![[0, 1]]);
72/// // `b2a` is the inverse of `a2b`
73/// assert_eq!(b2a, vec![[0], [0]]);
74///
75/// // `get_alignments` can be applied to noisy tokens.
76/// let a = vec!["à", "la", "gorge"];
77/// let b = vec!["a", "la", "gorge"]; // dropped accent
78/// let (a2b, b2a) = get_alignments(&a, &b);
79/// assert_eq!(a2b, vec![[0], [1], [2]]);
80/// assert_eq!(a2b, vec![[0], [1], [2]]);
81/// ```
82pub fn get_alignments<S: AsRef<str>>(a: &[S], b: &[S]) -> (Alignment, Alignment) {
83    let a: Vec<String> = a.iter().map(|x| normalize(x.as_ref())).collect();
84    let b: Vec<String> = b.iter().map(|x| normalize(x.as_ref())).collect();
85    let ac2t = get_char2token(&a);
86    let bc2t = get_char2token(&b);
87    let (a2b, b2a) = seqdiff::diff(
88        &a.join("").chars().collect::<Vec<_>>(),
89        &b.join("").chars().collect::<Vec<_>>(),
90    );
91    let at2bt = get_alignment(a.len(), &a2b, &ac2t, &bc2t);
92    let bt2at = get_alignment(b.len(), &b2a, &bc2t, &ac2t);
93    (at2bt, bt2at)
94}
95
96/// Returns the character mappings `c_a2b` (from `a` to `b`) and `c_b2a` (from `b` to `a`) based on the shortest edit script (SES).
97///
98/// `a` and `b` can be noisy. For example, `bar` and `bår` can be properly compared.
99///
100/// # Examples
101///
102/// Basic usage:
103///
104/// ```
105/// use tokenizations::get_charmap;
106/// let a = "bar";
107/// let b = "bår";
108/// let (c_a2b, c_b2a) = get_charmap(a, b);
109/// assert_eq!(c_a2b, vec![vec![0], vec![1], vec![2]]);
110/// assert_eq!(c_b2a, vec![vec![0], vec![1], vec![2]]);
111/// ```
112pub fn get_charmap(a: &str, b: &str) -> (CharMap, CharMap) {
113    let at: Vec<String> = a.chars().map(|x| x.to_string()).collect();
114    let bt: Vec<String> = b.chars().map(|x| x.to_string()).collect();
115    get_alignments(&at, &bt)
116}
117
118// Deprecated functions:
119
120fn _get_charmap(a: &str, b: &str) -> (Diff, Diff) {
121    let at: Vec<String> = a.chars().map(|x| x.to_string()).collect();
122    let bt: Vec<String> = b.chars().map(|x| x.to_string()).collect();
123    let (a2b, b2a) = get_alignments(&at, &bt);
124    let c_a2b: Diff = a2b.into_iter().map(|x| x.into_iter().next()).collect();
125    let c_b2a: Diff = b2a.into_iter().map(|x| x.into_iter().next()).collect();
126    (c_a2b, c_b2a)
127}
128
/// Returns the half-open `(start, end)` span of each token within the
/// concatenation of `tokens`, measured in `char`s.
fn get_span_indices<S: AsRef<str>>(tokens: &[S]) -> Vec<(usize, usize)> {
    let mut spans = Vec::with_capacity(tokens.len());
    let mut start = 0;
    for token in tokens {
        let end = start + token.as_ref().chars().count();
        spans.push((start, end));
        // The next token begins where this one stops.
        start = end;
    }
    spans
}
140
/// Concatenates all `tokens` into a single `String`, without any separator.
fn join<S: AsRef<str>>(tokens: &[S]) -> String {
    let pieces: Vec<&str> = tokens.iter().map(|token| token.as_ref()).collect();
    pieces.concat()
}
148
149#[deprecated(since = "0.5.0", note = "please use `textspan::align_spans` instead")]
150pub fn get_original_spans<S: AsRef<str>>(
151    tokens: &[S],
152    original_text: &str,
153) -> Vec<Option<(usize, usize)>> {
154    let spans = get_span_indices(tokens);
155    let text = join(tokens);
156    let (a2b, b2a) = _get_charmap(&text, original_text);
157
158    let mut ret = vec![];
159    for (l, r) in spans {
160        // get the leftmost corresponding char
161        let mut origl = None;
162        for &x in a2b[l..r].iter() {
163            if x != None {
164                origl = x;
165                break;
166            }
167        }
168        // get the rightmost corresponding char
169        let mut origr = None;
170        for x in a2b[l..r].iter().rev() {
171            if let Some(j) = x {
172                origr = Some(j + 1);
173                break;
174            }
175        }
176        // edge case: a token with empty string
177        if l == r {
178            if l >= a2b.len() {
179                origl = Some(b2a.len());
180            } else {
181                origl = a2b[l];
182            }
183            origr = origl;
184        }
185        ret.push(match (origl, origr) {
186            (Some(l), Some(r)) => Some((l, r)),
187            (None, None) => None,
188            _ => unreachable!(
189                "Internal error occured in get_original_span\ntokens: {:?}\noriginal_text: {:?}",
190                tokens.iter().map(|x| x.as_ref()).collect::<Vec<_>>(),
191                original_text
192            ),
193        })
194    }
195    ret
196}