#![deny(warnings)]
#[cfg(test)]
mod tests;
#[cfg(test)]
extern crate quickcheck;
#[cfg(test)]
extern crate quickcheck_macros;
extern crate seqdiff;
extern crate unicode_normalization;
use seqdiff::Diff;
use unicode_normalization::UnicodeNormalization;
pub type Alignment = Vec<Vec<usize>>;
pub type CharMap = Vec<Vec<usize>>;
fn normalize(text: &str) -> String {
text.to_lowercase().nfkd().collect()
}
fn get_char2token<T: AsRef<str>>(tokens: &[T]) -> Vec<usize> {
let token_lengths = tokens
.iter()
.map(|s| s.as_ref().chars().count())
.collect::<Vec<_>>();
let mut ret = vec![0; token_lengths.iter().sum()];
let mut cur = 0;
for (i, &l) in token_lengths.iter().enumerate() {
for _ in 0..l {
ret[cur] = i;
cur += 1;
}
}
ret
}
fn get_alignment(
num_tokens: usize,
a2b: &[Option<usize>],
ac2t: &[usize],
bc2t: &[usize],
) -> Vec<Vec<usize>> {
let mut at2bt = vec![vec![]; num_tokens];
for (ti, a2bi) in ac2t.iter().zip(a2b) {
if let Some(i) = a2bi {
if let Some(j) = at2bt[*ti].last() {
if *j == bc2t[*i] {
continue;
}
}
at2bt[*ti].push(bc2t[*i])
}
}
at2bt
}
pub fn get_alignments<S: AsRef<str>>(a: &[S], b: &[S]) -> (Alignment, Alignment) {
let a: Vec<String> = a.iter().map(|x| normalize(x.as_ref())).collect();
let b: Vec<String> = b.iter().map(|x| normalize(x.as_ref())).collect();
let ac2t = get_char2token(&a);
let bc2t = get_char2token(&b);
let (a2b, b2a) = seqdiff::diff(
&a.join("").chars().collect::<Vec<_>>(),
&b.join("").chars().collect::<Vec<_>>(),
);
let at2bt = get_alignment(a.len(), &a2b, &ac2t, &bc2t);
let bt2at = get_alignment(b.len(), &b2a, &bc2t, &ac2t);
(at2bt, bt2at)
}
pub fn get_charmap(a: &str, b: &str) -> (CharMap, CharMap) {
let at: Vec<String> = a.chars().map(|x| x.to_string()).collect();
let bt: Vec<String> = b.chars().map(|x| x.to_string()).collect();
get_alignments(&at, &bt)
}
fn _get_charmap(a: &str, b: &str) -> (Diff, Diff) {
let at: Vec<String> = a.chars().map(|x| x.to_string()).collect();
let bt: Vec<String> = b.chars().map(|x| x.to_string()).collect();
let (a2b, b2a) = get_alignments(&at, &bt);
let c_a2b: Diff = a2b.into_iter().map(|x| x.into_iter().next()).collect();
let c_b2a: Diff = b2a.into_iter().map(|x| x.into_iter().next()).collect();
(c_a2b, c_b2a)
}
fn get_span_indices<S: AsRef<str>>(tokens: &[S]) -> Vec<(usize, usize)> {
tokens
.iter()
.scan(0, |state, token| {
let l = *state;
let r = l + token.as_ref().chars().count();
*state = r;
Some((l, r))
})
.collect()
}
fn join<S: AsRef<str>>(tokens: &[S]) -> String {
let mut text = "".to_owned();
for token in tokens.iter() {
text.push_str(token.as_ref());
}
text
}
#[deprecated(since = "0.5.0", note = "please use `textspan::align_spans` instead")]
pub fn get_original_spans<S: AsRef<str>>(
tokens: &[S],
original_text: &str,
) -> Vec<Option<(usize, usize)>> {
let spans = get_span_indices(tokens);
let text = join(tokens);
let (a2b, b2a) = _get_charmap(&text, original_text);
let mut ret = vec![];
for (l, r) in spans {
let mut origl = None;
for &x in a2b[l..r].iter() {
if x != None {
origl = x;
break;
}
}
let mut origr = None;
for x in a2b[l..r].iter().rev() {
if let Some(j) = x {
origr = Some(j + 1);
break;
}
}
if l == r {
if l >= a2b.len() {
origl = Some(b2a.len());
} else {
origl = a2b[l];
}
origr = origl;
}
ret.push(match (origl, origr) {
(Some(l), Some(r)) => Some((l, r)),
(None, None) => None,
_ => unreachable!(
"Internal error occured in get_original_span\ntokens: {:?}\noriginal_text: {:?}",
tokens.iter().map(|x| x.as_ref()).collect::<Vec<_>>(),
original_text
),
})
}
ret
}