use crate::tokenizer::tag::Tagger;
pub use crate::tokenizer::tag::{PosId, WordId};
pub(crate) use crate::tokenizer::tag::{PosIdInt, SpecialPos, WordIdInt};
use derivative::Derivative;
use lazy_static::lazy_static;
use serde::{Deserialize, Serialize};
use std::{
cmp::Ordering,
collections::{hash_map, HashMap, HashSet},
ops::{Add, AddAssign, Range, Sub},
};
/// Map type used throughout the crate; currently an alias for the std `HashMap`.
pub(crate) type DefaultHashMap<K, V> = HashMap<K, V>;
/// Set type used throughout the crate; currently an alias for the std `HashSet`.
pub(crate) type DefaultHashSet<T> = HashSet<T>;
/// Hasher matching the default map / set aliases above.
pub(crate) type DefaultHasher = hash_map::DefaultHasher;
/// Owned variants of the token types, with no lifetime ties to the original
/// text (note the `String` / `Vec` fields where the outer types borrow).
pub mod owned {
use super::*;
use serde::{Deserialize, Serialize};
/// Owned counterpart of [`super::WordId`]: the word text plus the optional
/// interned word id.
#[derive(Debug, Serialize, Deserialize, Hash, Eq, PartialEq, Clone)]
pub struct WordId(pub(crate) String, pub(crate) Option<WordIdInt>);
impl WordId {
/// Converts this owned id into the lifetime-parameterized [`super::WordId`]
/// borrowing (or copying into) its text.
pub fn as_ref_id(&self) -> super::WordId {
super::WordId(self.0.as_str().into(), self.1)
}
}
impl AsRef<str> for WordId {
fn as_ref(&self) -> &str {
self.0.as_ref()
}
}
/// Owned counterpart of [`super::PosId`]: the POS tag text plus its interned id.
#[derive(Debug, Serialize, Deserialize, Hash, Eq, PartialEq, Clone)]
pub struct PosId(pub(crate) String, pub(crate) PosIdInt);
impl PosId {
/// Converts this owned id into a regular (non-special) [`super::PosId`].
pub fn as_ref_id(&self) -> super::PosId {
super::PosId::regular(self.0.as_str(), self.1)
}
}
impl AsRef<str> for PosId {
fn as_ref(&self) -> &str {
self.0.as_ref()
}
}
/// One reading of a word: lemma plus POS tag (owned variant).
#[derive(Debug, Serialize, Deserialize, Hash, Eq, PartialEq, Clone)]
#[allow(missing_docs)]
pub struct WordData {
pub lemma: WordId,
pub pos: PosId,
}
impl WordData {
/// Creates word data from a lemma and a POS id.
pub fn new(lemma: WordId, pos_id: PosId) -> Self {
WordData { lemma, pos: pos_id }
}
}
/// A word: its text plus all of its readings (owned variant).
#[derive(Debug, Serialize, Deserialize, Hash, Eq, PartialEq, Clone)]
#[allow(missing_docs)]
pub struct Word {
pub text: WordId,
pub tags: Vec<WordData>,
}
/// A finalized token: word, span, whitespace flag and chunk annotations
/// (owned variant).
#[derive(Debug, Serialize, Deserialize, Hash, Eq, PartialEq, Clone)]
#[allow(missing_docs)]
pub struct Token {
pub word: Word,
pub span: Span,
pub has_space_before: bool,
pub chunks: Vec<String>,
}
}
/// A tokenized sentence whose tokens have not yet been finalized with the
/// special POS tags; converted via `into_sentence`.
#[derive(Derivative, Clone)]
#[derivative(Debug, PartialEq)]
pub struct IncompleteSentence<'t> {
// The text this sentence was tokenized from.
text: &'t str,
tokens: Vec<IncompleteToken<'t>>,
// Excluded from Debug / PartialEq via derivative.
#[derivative(Debug = "ignore", PartialEq = "ignore")]
tagger: &'t Tagger,
// Byte and char span of this sentence (initialized to cover all of `text`).
span: Span,
}
/// Consuming iteration yields the sentence's tokens in order.
impl<'t> IntoIterator for IncompleteSentence<'t> {
type Item = IncompleteToken<'t>;
type IntoIter = std::vec::IntoIter<Self::Item>;
fn into_iter(self) -> Self::IntoIter {
self.tokens.into_iter()
}
}
// NOTE(review): was `#[allow(clippy::clippy::len_without_is_empty)]` — the
// doubled `clippy::` prefix names a nonexistent lint path, so the intended
// suppression did not apply as written.
#[allow(clippy::len_without_is_empty)]
impl<'t> IncompleteSentence<'t> {
    /// Creates a new incomplete sentence spanning the whole of `text`.
    pub(crate) fn new(tokens: Vec<IncompleteToken<'t>>, text: &'t str, tagger: &'t Tagger) -> Self {
        IncompleteSentence {
            text,
            tokens,
            tagger,
            // Full text coverage, in both byte and char coordinates.
            span: Span::new(0..text.len(), 0..text.chars().count()),
        }
    }

    /// Gets the text this sentence was tokenized from.
    pub fn text(&self) -> &'t str {
        self.text
    }

    /// Returns a mutable, double-ended iterator over the tokens.
    pub fn iter_mut(&mut self) -> impl DoubleEndedIterator<Item = &mut IncompleteToken<'t>> {
        self.tokens.iter_mut()
    }

    /// Returns a double-ended iterator over the tokens.
    pub fn iter(&self) -> impl DoubleEndedIterator<Item = &IncompleteToken> {
        self.tokens.iter()
    }

    /// Returns the number of tokens in this sentence.
    pub fn len(&self) -> usize {
        self.tokens.len()
    }

    /// Gets the tagger associated with this sentence.
    pub fn tagger(&self) -> &'t Tagger {
        self.tagger
    }

    /// Converts into a [`Sentence`] by finalizing every token
    /// (see `IncompleteToken::into_token`).
    pub fn into_sentence(self) -> Sentence<'t> {
        let tagger = self.tagger();
        Sentence {
            text: self.text(),
            tagger,
            tokens: self
                .tokens
                .into_iter()
                .map(|token| token.into_token())
                .collect(),
            span: self.span,
        }
    }

    /// Gets the span of this sentence.
    pub fn span(&self) -> &Span {
        &self.span
    }

    /// Shifts the sentence span and every token span right by `position`.
    pub fn rshift(mut self, position: Position) -> Self {
        self.span = self.span.rshift(position);
        self.tokens = self
            .tokens
            .into_iter()
            .map(|x| x.rshift(position))
            .collect();
        self
    }
}
/// A finalized, tokenized sentence.
#[derive(Derivative, Clone)]
#[derivative(Debug, PartialEq)]
pub struct Sentence<'t> {
// The text this sentence was tokenized from.
text: &'t str,
tokens: Vec<Token<'t>>,
// Excluded from Debug / PartialEq via derivative.
#[derivative(Debug = "ignore", PartialEq = "ignore")]
tagger: &'t Tagger,
// Byte and char span of this sentence.
span: Span,
}
/// Consuming iteration yields the sentence's tokens in order.
impl<'t> IntoIterator for Sentence<'t> {
type Item = Token<'t>;
type IntoIter = std::vec::IntoIter<Self::Item>;
fn into_iter(self) -> Self::IntoIter {
self.tokens.into_iter()
}
}
// NOTE(review): was `#[allow(clippy::clippy::len_without_is_empty)]` — the
// doubled `clippy::` prefix names a nonexistent lint path, so the intended
// suppression did not apply as written.
#[allow(clippy::len_without_is_empty)]
impl<'t> Sentence<'t> {
    /// Gets the tokens in this sentence.
    pub fn tokens(&self) -> &[Token<'t>] {
        &self.tokens
    }

    /// Returns a double-ended iterator over the tokens.
    pub fn iter(&self) -> impl DoubleEndedIterator<Item = &Token> {
        self.tokens.iter()
    }

    /// Gets the text this sentence was tokenized from.
    pub fn text(&self) -> &'t str {
        self.text
    }

    /// Returns the number of tokens in this sentence.
    pub fn len(&self) -> usize {
        self.tokens.len()
    }

    /// Gets the tagger associated with this sentence.
    pub fn tagger(&self) -> &'t Tagger {
        self.tagger
    }

    /// Gets the span of this sentence.
    pub fn span(&self) -> &Span {
        &self.span
    }
}
/// One reading of a word: its lemma and part-of-speech tag.
#[derive(Debug, Clone, PartialEq)]
pub struct WordData<'t> {
lemma: WordId<'t>,
pos: PosId<'t>,
// If true, this reading survives `Word::retain` / `Word::clear`.
frozen: bool,
}
impl<'t> WordData<'t> {
    /// Creates new word data; readings start out unfrozen.
    pub fn new(lemma: WordId<'t>, pos: PosId<'t>) -> Self {
        let frozen = false;
        WordData { lemma, pos, frozen }
    }

    /// The lemma of this reading.
    pub fn lemma(&self) -> &WordId<'t> {
        &self.lemma
    }

    /// The POS tag of this reading.
    pub fn pos(&self) -> &PosId<'t> {
        &self.pos
    }

    /// Marks this reading as frozen, so `Word::retain` always keeps it.
    pub fn freeze(&mut self) {
        self.frozen = true;
    }

    /// Whether this reading is frozen.
    pub fn frozen(&self) -> bool {
        self.frozen
    }

    /// Converts to the owned variant.
    pub fn to_owned_word_data(&self) -> owned::WordData {
        let lemma = self.lemma.to_owned_id();
        let pos = self.pos.to_owned_id();
        owned::WordData { lemma, pos }
    }
}
/// A word: its text id plus all of its readings.
#[derive(Debug, Clone, PartialEq)]
pub struct Word<'t> {
text: WordId<'t>,
tags: Vec<WordData<'t>>,
}
impl<'t> Word<'t> {
pub fn new(text: WordId<'t>, tags: Vec<WordData<'t>>) -> Self {
Word { text, tags }
}
pub fn text(&self) -> &WordId<'t> {
&self.text
}
pub fn tags(&self) -> &[WordData<'t>] {
&self.tags
}
pub fn as_str(&'t self) -> &'t str {
self.text.as_str()
}
pub fn clear(&mut self) {
self.retain(|_| false);
}
pub fn retain<F: FnMut(&WordData<'t>) -> bool>(&mut self, mut f: F) {
self.tags.retain(|data| data.frozen() || f(data));
}
pub fn push(&mut self, data: WordData<'t>) {
self.tags.push(data);
}
pub fn to_owned_word(&self) -> owned::Word {
owned::Word {
text: self.text.to_owned_id(),
tags: self.tags.iter().map(|x| x.to_owned_word_data()).collect(),
}
}
}
/// A token that has not yet been finalized with the special POS tags;
/// converted via `into_token`.
#[derive(Debug, Clone, PartialEq)]
pub struct IncompleteToken<'t> {
word: Word<'t>,
// Byte and char span of this token in the text.
span: Span,
// Whether this is the last token of its sentence.
is_sentence_end: bool,
// Whether whitespace precedes this token.
has_space_before: bool,
chunks: Vec<String>,
}
impl<'t> IncompleteToken<'t> {
    /// Creates a new incomplete token with the given readings and metadata.
    pub(crate) fn new(
        word: Word<'t>,
        span: Span,
        is_sentence_end: bool,
        has_space_before: bool,
        chunks: Vec<String>,
    ) -> Self {
        IncompleteToken {
            word,
            span,
            is_sentence_end,
            has_space_before,
            chunks,
        }
    }

    /// Finalizes this token into a [`Token`] by appending special tags:
    /// - a `SpecialPos::None` tag is always appended,
    /// - a `SpecialPos::Unknown` tag is appended if every reading (including
    ///   the one just appended) has an empty POS string,
    /// - a `SpecialPos::SentEnd` tag is appended if this is the sentence's
    ///   last token.
    pub fn into_token(self) -> Token<'t> {
        // NOTE(review): this previously deep-cloned the whole `Word` (tags
        // vector included) and then moved the original text out at the end.
        // Mutating the owned word in place produces identical tags with only
        // cheap `WordId` clones.
        let mut word = self.word;

        word.tags.push(WordData::new(
            word.text.clone(),
            PosId::special(SpecialPos::None),
        ));

        if word.tags.iter().all(|x| x.pos.as_str().is_empty()) {
            word.tags.push(WordData::new(
                word.text.clone(),
                PosId::special(SpecialPos::Unknown),
            ));
        }

        if self.is_sentence_end {
            word.tags.push(WordData::new(
                word.text.clone(),
                PosId::special(SpecialPos::SentEnd),
            ));
        }

        Token {
            word,
            span: self.span,
            has_space_before: self.has_space_before,
            chunks: self.chunks,
        }
    }

    /// The word of this token with all its readings.
    pub fn word(&self) -> &Word<'t> {
        &self.word
    }

    #[allow(missing_docs)]
    pub fn word_mut(&mut self) -> &mut Word<'t> {
        &mut self.word
    }

    /// The span of this token in the text.
    pub fn span(&self) -> &Span {
        &self.span
    }

    /// Whether this token is the last one of its sentence.
    pub fn is_sentence_end(&self) -> bool {
        self.is_sentence_end
    }

    #[allow(missing_docs)]
    pub fn is_sentence_end_mut(&mut self) -> &mut bool {
        &mut self.is_sentence_end
    }

    /// Whether whitespace precedes this token.
    pub fn has_space_before(&self) -> bool {
        self.has_space_before
    }

    /// Chunk annotations attached to this token.
    pub fn chunks(&self) -> &[String] {
        &self.chunks
    }

    #[allow(missing_docs)]
    pub fn chunks_mut(&mut self) -> &mut Vec<String> {
        &mut self.chunks
    }

    /// Returns this token with its span shifted right by `position`.
    pub fn rshift(mut self, position: Position) -> Self {
        self.span = self.span.rshift(position);
        self
    }
}
/// A finalized token: word with readings, span, whitespace flag and chunks.
#[derive(Clone, Debug, PartialEq)]
#[allow(missing_docs)]
pub struct Token<'t> {
word: Word<'t>,
span: Span,
has_space_before: bool,
chunks: Vec<String>,
}
lazy_static! {
    /// Singleton marker token carrying only the `SentStart` special tag;
    /// exposed through `Token::sent_start`.
    static ref SENT_START: Token<'static> = {
        Token {
            word: Word::new(
                WordId::empty(),
                // NOTE(review): the original built this vector and then ran it
                // through `.into_iter().collect()`, a redundant Vec -> Vec
                // round trip.
                vec![WordData::new(
                    WordId::empty(),
                    PosId::special(SpecialPos::SentStart),
                )],
            ),
            span: Span::default(),
            has_space_before: false,
            chunks: Vec::new(),
        }
    };
}
impl<'t> Token<'t> {
    /// Converts to the owned variant, cloning the span and chunks.
    pub fn to_owned_token(&self) -> owned::Token {
        let word = self.word.to_owned_word();
        let span = self.span.clone();
        let chunks = self.chunks.clone();
        owned::Token {
            word,
            span,
            has_space_before: self.has_space_before,
            chunks,
        }
    }

    /// The word of this token with all its readings.
    pub fn word(&self) -> &Word<'t> {
        &self.word
    }

    /// The span of this token in the text.
    pub fn span(&self) -> &Span {
        &self.span
    }

    /// Whether whitespace precedes this token.
    pub fn has_space_before(&self) -> bool {
        self.has_space_before
    }

    /// Chunk annotations attached to this token.
    pub fn chunks(&self) -> &[String] {
        &self.chunks
    }

    /// The shared sentence-start marker token (see `SENT_START`).
    pub(crate) fn sent_start() -> &'static Token<'static> {
        &*SENT_START
    }

    /// Returns this token with its span shifted right by `position`.
    pub fn rshift(mut self, position: Position) -> Self {
        self.span = self.span.rshift(position);
        self
    }
}
/// A position in a text, tracked in both byte and char offsets.
#[derive(Debug, Clone, Copy, PartialEq, Default, Serialize, Deserialize)]
pub struct Position {
pub byte: usize,
pub char: usize,
}
impl PartialOrd for Position {
    /// Positions are comparable only when their byte and char orderings
    /// agree; if the two coordinates disagree there is no order (`None`).
    fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
        match (self.byte.cmp(&other.byte), self.char.cmp(&other.char)) {
            (by_byte, by_char) if by_byte == by_char => Some(by_byte),
            _ => None,
        }
    }
}
impl AddAssign for Position {
    /// Component-wise `+=`, matching the `Add` implementation below.
    fn add_assign(&mut self, other: Self) {
        self.byte += other.byte;
        self.char += other.char;
    }
}
impl Add for Position {
type Output = Self;
fn add(self, other: Self) -> Self {
Self {
byte: self.byte + other.byte,
char: self.char + other.char,
}
}
}
impl Sub for Position {
type Output = Self;
fn sub(self, other: Self) -> Self {
Self {
byte: self.byte.saturating_sub(other.byte),
char: self.char.saturating_sub(other.char),
}
}
}
/// A region of text, tracked as parallel byte and char ranges.
#[derive(Debug, Clone, PartialEq, Eq, Hash, Default, Serialize, Deserialize)]
pub struct Span {
byte: Range<usize>,
char: Range<usize>,
}
impl Span {
pub fn from_positions(start: Position, end: Position) -> Self {
assert!(
end >= start,
"end position must be greater than or equal to the start."
);
Span {
byte: start.byte..end.byte,
char: start.char..end.char,
}
}
pub fn new(byte: Range<usize>, char: Range<usize>) -> Self {
Span { byte, char }
}
pub fn start(&self) -> Position {
Position {
byte: self.byte.start,
char: self.char.start,
}
}
pub fn end(&self) -> Position {
Position {
byte: self.byte.end,
char: self.char.end,
}
}
pub fn byte(&self) -> &Range<usize> {
&self.byte
}
pub fn char(&self) -> &Range<usize> {
&self.char
}
pub fn is_empty(&self) -> bool {
self.end() == self.start()
}
pub fn len(&self) -> Position {
self.end() - self.start()
}
pub fn set_start(&mut self, start: Position) {
self.byte.start = start.byte;
self.char.start = start.char;
}
pub fn set_end(&mut self, end: Position) {
self.byte.end = end.byte;
self.char.end = end.char;
}
pub fn rshift(mut self, position: Position) -> Self {
self.byte.start += position.byte;
self.byte.end += position.byte;
self.char.start += position.char;
self.char.end += position.char;
self
}
pub fn lshift(mut self, position: Position) -> Self {
self.byte.start = self.byte.start.saturating_sub(position.byte);
self.byte.end = self.byte.end.saturating_sub(position.byte);
self.char.start = self.char.start.saturating_sub(position.char);
self.char.end = self.char.end.saturating_sub(position.char);
self
}
}
/// A suggested correction: a source identifier, a human-readable message,
/// the span it applies to, and candidate replacement strings.
#[derive(Debug, Serialize, Deserialize, Clone)]
pub struct Suggestion {
// presumably the id of the rule that produced this suggestion — TODO confirm
source: String,
message: String,
span: Span,
replacements: Vec<String>,
}
impl std::cmp::PartialEq for Suggestion {
    /// Two suggestions are considered equal if they cover the same span and
    /// share at least one replacement.
    ///
    /// NOTE(review): this relation is not transitive (A and C may each share
    /// a replacement with B but not with each other), which `PartialEq`
    /// implementations are normally expected to be — confirm callers only
    /// rely on pairwise overlap checks.
    fn eq(&self, other: &Suggestion) -> bool {
        // Check the span first: it is cheap and avoids building the hash
        // sets when the spans already differ.
        if self.span() != other.span() {
            return false;
        }
        let ours: HashSet<&String> = self.replacements().iter().collect();
        let theirs: HashSet<&String> = other.replacements().iter().collect();
        // `is_disjoint` short-circuits on the first common element, unlike
        // the original `intersection(..).count() > 0`.
        !ours.is_disjoint(&theirs)
    }
}
impl Suggestion {
    /// Creates a new suggestion.
    pub(crate) fn new(
        source: String,
        message: String,
        span: Span,
        replacements: Vec<String>,
    ) -> Self {
        Suggestion {
            source,
            message,
            span,
            replacements,
        }
    }

    /// The source identifier of this suggestion.
    pub fn source(&self) -> &str {
        &self.source
    }

    /// The human-readable message of this suggestion.
    pub fn message(&self) -> &str {
        &self.message
    }

    /// The candidate replacement strings.
    pub fn replacements(&self) -> &[String] {
        &self.replacements
    }

    /// The span this suggestion applies to.
    pub fn span(&self) -> &Span {
        &self.span
    }

    /// Returns this suggestion with its span shifted right by `position`.
    pub fn rshift(mut self, position: Position) -> Self {
        self.span = self.span.rshift(position);
        self
    }

    /// Returns this suggestion with its span shifted left by `position`
    /// (saturating at zero, see `Span::lshift`).
    pub fn lshift(mut self, position: Position) -> Self {
        self.span = self.span.lshift(position);
        self
    }
}