#![allow(missing_docs)]
use crate::Entity;
/// A scored candidate span over a word sequence, as consumed by [`decode_spans`].
///
/// Word positions are indices into the `word_offsets` slice passed to
/// [`decode_spans`]; both ends are inclusive.
#[derive(Debug, Clone, Copy)]
#[allow(dead_code)]
pub struct Span {
    /// Index of the first word covered by the span.
    pub start_word: usize,
    /// Index of the last word covered by the span (inclusive).
    pub end_word: usize,
    /// Index into the `labels` slice passed to [`decode_spans`].
    pub label_idx: usize,
    /// Score compared against the decoding threshold and copied onto the
    /// resulting entity.
    pub score: f32,
}
/// Turns word-level [`Span`] predictions into [`Entity`] values anchored in `text`.
///
/// * `text` - the source string the spans refer to.
/// * `word_offsets` - one `(byte_start, byte_end)` pair per word, used to slice
///   `text`; `byte_end` is exclusive.
/// * `labels` - label names; `Span::label_idx` indexes into this slice.
/// * `spans` - candidate spans; `start_word` and `end_word` are inclusive word
///   indices into `word_offsets`.
/// * `threshold` - spans whose `score` is strictly below this value are skipped.
///
/// A span is silently dropped (never a panic from this function's own
/// indexing) when any of the following holds:
///
/// * its score is below `threshold`;
/// * `start_word > end_word`;
/// * `start_word`, `end_word` or `label_idx` is out of range;
/// * the resulting byte range is not a valid slice of `text` (reversed, past
///   the end, or not on a UTF-8 character boundary).
///
/// Surviving spans are returned in input order. Each entity carries the
/// covered substring, the label passed through
/// `crate::schema::map_to_canonical`, the offsets produced by
/// `crate::offset::bytes_to_chars`, and the span's score.
#[allow(dead_code)]
pub fn decode_spans(
    text: &str,
    word_offsets: &[(usize, usize)],
    labels: &[String],
    spans: &[Span],
    threshold: f32,
) -> Vec<Entity> {
    spans
        .iter()
        .filter_map(|s| {
            if s.score < threshold || s.start_word > s.end_word {
                return None;
            }

            // Checked lookups: a bad index drops the span instead of panicking.
            let &(byte_start, _) = word_offsets.get(s.start_word)?;
            let &(_, byte_end) = word_offsets.get(s.end_word)?;
            let label = labels.get(s.label_idx)?;

            // `str::get` rejects reversed ranges, ranges past the end of `text`
            // and ranges that split a multi-byte character, all of which would
            // make `&text[a..b]` panic. Validate before calling the offset
            // helper so it only ever sees a well-formed byte range.
            let surface = text.get(byte_start..byte_end)?;

            let (char_start, char_end) =
                crate::offset::bytes_to_chars(text, byte_start, byte_end);
            let etype = crate::schema::map_to_canonical(label, None);
            Some(Entity::new(surface, etype, char_start, char_end, s.score))
        })
        .collect()
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Shorthand constructor that keeps the fixtures below compact.
    fn span(start_word: usize, end_word: usize, label_idx: usize, score: f32) -> Span {
        Span {
            start_word,
            end_word,
            label_idx,
            score,
        }
    }

    /// ASCII input: two spans clear the threshold, the low-scoring third is dropped.
    #[test]
    fn decodes_two_spans_with_char_offsets() {
        let text = "Acme Corp in Paris.";
        let words = [(0, 4), (5, 9), (10, 12), (13, 18), (18, 19)];
        let labels = vec![String::from("organization"), String::from("location")];
        let candidates = vec![
            span(0, 1, 0, 0.9),
            span(3, 3, 1, 0.8),
            span(0, 0, 0, 0.1),
        ];

        let ents = decode_spans(text, &words, &labels, &candidates, 0.5);

        assert_eq!(ents.len(), 2);

        let org = &ents[0];
        assert_eq!(org.text, "Acme Corp");
        assert_eq!(org.start(), 0);
        assert_eq!(org.end(), 9);

        let loc = &ents[1];
        assert_eq!(loc.text, "Paris");
        assert_eq!(loc.start(), 13);
        assert_eq!(loc.end(), 18);
    }

    /// Multi-byte input: word offsets are in bytes, entity offsets are expected
    /// in characters.
    #[test]
    fn decodes_unicode_with_char_offsets() {
        let text = "田中 Paris";
        let words = [(0, 6), (7, 12)];
        let labels = vec![String::from("person"), String::from("location")];
        let candidates = vec![span(0, 0, 0, 0.9), span(1, 1, 1, 0.9)];

        let ents = decode_spans(text, &words, &labels, &candidates, 0.5);

        assert_eq!(ents.len(), 2);
        assert_eq!(ents[0].text, "田中");
        assert_eq!(ents[0].start(), 0);
        assert_eq!(ents[0].end(), 2);
        assert_eq!(ents[1].start(), 3);
        assert_eq!(ents[1].end(), 8);
    }

    /// Spans with an out-of-range word index, an out-of-range label index, or
    /// start after end must all be skipped.
    #[test]
    fn out_of_range_spans_are_dropped() {
        let text = "a b";
        let words = [(0, 1), (2, 3)];
        let labels = vec![String::from("x")];
        let candidates = vec![
            span(0, 99, 0, 0.9),
            span(0, 0, 99, 0.9),
            span(1, 0, 0, 0.9),
        ];

        let ents = decode_spans(text, &words, &labels, &candidates, 0.0);

        assert!(ents.is_empty());
    }
}