use phf::{self, phf_map, phf_set};
/// Set type used for the character classes declared in this module.
///
/// This is a plain alias for [`phf::Set`]; the sets below are built with the
/// `phf_set!` macro.
pub type Set<T> = phf::Set<T>;
/// Supplies the set of characters that count as sentence endings.
///
/// The default set is `.`, `?` and `!`. Implementors may override
/// [`SENTENCE_ENDINGS`](Self::SENTENCE_ENDINGS) to provide a different set.
pub trait DefinesSentenceEndings {
    /// Characters treated as sentence endings.
    const SENTENCE_ENDINGS: &'static Set<char> = &phf_set! { '.', '?', '!' };

    /// Returns `true` when `c` is a member of
    /// [`SENTENCE_ENDINGS`](Self::SENTENCE_ENDINGS).
    #[inline]
    fn is_sentence_ending(c: &char) -> bool {
        let endings: &Set<char> = Self::SENTENCE_ENDINGS;
        endings.contains(c)
    }
}
/// Supplies the set of characters that count as sentence-internal punctuation.
///
/// The default set is `,`, `:`, `;` and U+2014 (em dash). Implementors may
/// override [`INTERNAL_PUNCTUATION`](Self::INTERNAL_PUNCTUATION).
pub trait DefinesInternalPunctuation {
    /// Characters treated as internal punctuation.
    const INTERNAL_PUNCTUATION: &'static Set<char> = &phf_set! { ',', ':', ';', '\u{2014}' };

    /// Returns `true` when `c` is a member of
    /// [`INTERNAL_PUNCTUATION`](Self::INTERNAL_PUNCTUATION).
    #[inline]
    fn is_internal_punctuation(c: &char) -> bool {
        let internal: &Set<char> = Self::INTERNAL_PUNCTUATION;
        internal.contains(c)
    }
}
/// Supplies the set of characters classified as "non-word" characters.
///
/// The default set holds brackets, quotes and a handful of symbols (see the
/// constant). Implementors may override
/// [`NONWORD_CHARS`](Self::NONWORD_CHARS).
pub trait DefinesNonWordCharacters {
    /// Characters treated as non-word characters.
    const NONWORD_CHARS: &'static Set<char> = &phf_set! {
        '?', '!', ')', '"', ';', '}', ']', '*', ':', '@', '\'', '(', '{', '['
    };

    /// Returns `true` when `c` is a member of
    /// [`NONWORD_CHARS`](Self::NONWORD_CHARS).
    #[inline]
    fn is_nonword_char(c: &char) -> bool {
        let nonword: &Set<char> = Self::NONWORD_CHARS;
        nonword.contains(c)
    }
}
/// Supplies the set of characters that count as punctuation.
///
/// The default set is `;`, `:`, `,`, `.`, `!` and `?`. Implementors may
/// override [`PUNCTUATION`](Self::PUNCTUATION).
pub trait DefinesPunctuation {
    /// Characters treated as punctuation.
    const PUNCTUATION: &'static Set<char> = &phf_set! { ';', ':', ',', '.', '!', '?' };

    /// Returns `true` when `c` is a member of
    /// [`PUNCTUATION`](Self::PUNCTUATION).
    #[inline]
    fn is_punctuation(c: &char) -> bool {
        let punctuation: &Set<char> = Self::PUNCTUATION;
        punctuation.contains(c)
    }
}
/// Supplies the set of characters classified as "non-prefix" characters.
///
/// The default set holds brackets, quotes and a handful of symbols (see the
/// constant). Implementors may override
/// [`NONPREFIX_CHARS`](Self::NONPREFIX_CHARS).
pub trait DefinesNonPrefixCharacters {
    /// Characters treated as non-prefix characters.
    const NONPREFIX_CHARS: &'static Set<char> = &phf_set! {
        '(', '"', '`', '{', '[', ':', ';', '&', '#', '*', '@', ')', '}', ']', '-', ','
    };

    /// Returns `true` when `c` is a member of
    /// [`NONPREFIX_CHARS`](Self::NONPREFIX_CHARS).
    #[inline]
    fn is_nonprefix_char(c: &char) -> bool {
        let nonprefix: &Set<char> = Self::NONPREFIX_CHARS;
        nonprefix.contains(c)
    }
}
/// Numeric thresholds and switches exposed as overridable associated constants.
///
/// Every constant has a default, so an implementor can write an empty `impl`
/// and override only the values it wants to change. Implementors must also
/// implement [`DefinesSentenceEndings`] and [`DefinesInternalPunctuation`].
///
/// NOTE(review): how each value is consumed is decided by the training code,
/// which is not part of this block; the per-constant notes below are read off
/// the names and should be confirmed against that code.
pub trait TrainerParameters: DefinesSentenceEndings + DefinesInternalPunctuation {
    /// Defaults to `0.3`. Presumably the lower score bound for abbreviations.
    const ABBREV_LOWER_BOUND: f64 = 0.3;

    /// Defaults to `5.0`. Presumably the upper score bound for abbreviations.
    const ABBREV_UPPER_BOUND: f64 = 5.0;

    /// Defaults to `false`. Presumably disables a penalty term in abbreviation scoring.
    const IGNORE_ABBREV_PENALTY: bool = false;

    /// Defaults to `7.88`. Presumably the lower score bound for collocations.
    const COLLOCATION_LOWER_BOUND: f64 = 7.88;

    /// Defaults to `30.0`. Presumably the lower score bound for sentence starters.
    const SENTENCE_STARTER_LOWER_BOUND: f64 = 30.0;

    /// Defaults to `false`. Presumably widens collocation collection to every word pair.
    const INCLUDE_ALL_COLLOCATIONS: bool = false;

    /// Defaults to `false`. Presumably includes collocations that follow abbreviations.
    const INCLUDE_ABBREV_COLLOCATIONS: bool = false;

    /// Defaults to `1.0`. Presumably the minimum frequency required of a collocation.
    const COLLOCATION_FREQUENCY_LOWER_BOUND: f64 = 1.0;
}
/// Marker type that adopts the default value of every parameter trait.
///
/// Each `impl` below is empty, so `Standard` uses the default character sets
/// and the default [`TrainerParameters`] constants unchanged.
pub struct Standard;

// Character-class traits.
impl DefinesSentenceEndings for Standard {}
impl DefinesInternalPunctuation for Standard {}
impl DefinesPunctuation for Standard {}
impl DefinesNonWordCharacters for Standard {}
impl DefinesNonPrefixCharacters for Standard {}

// Trainer thresholds (requires the sentence-ending and internal-punctuation impls above).
impl TrainerParameters for Standard {}
/// Bit-flag byte describing orthographic contexts.
///
/// A plain `u8`; the individual flag bits are the `BEG_*`, `MID_*` and `UNK_*`
/// constants declared in this module.
pub type OrthographicContext = u8;
/// Position tag that can be encoded as a single byte.
///
/// The byte produced by [`as_byte`](Self::as_byte) occupies bits 5 and 6 only,
/// so it can be OR-ed with a `LetterCase` byte (bits 0 and 1) without overlap;
/// the combined value is the key format used by `ORTHO_MAP`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum OrthographyPosition {
    Initial,
    Internal,
    Unknown,
}

impl OrthographyPosition {
    /// Returns the byte encoding of this position.
    ///
    /// `Initial` sets bit 6, `Internal` sets bit 5, and `Unknown` sets both
    /// (it equals `Initial | Internal`).
    #[inline]
    pub fn as_byte(&self) -> u8 {
        match self {
            Self::Initial => 0b0100_0000,
            Self::Internal => 0b0010_0000,
            Self::Unknown => 0b0110_0000,
        }
    }
}
// Single-bit orthographic context flags. The pairing of each flag with a
// position and letter case is the one recorded in `ORTHO_MAP`.

/// Bit 1: the flag `ORTHO_MAP` stores for initial position + upper case.
pub const BEG_UC: OrthographicContext = 1 << 1;
/// Bit 2: the flag `ORTHO_MAP` stores for internal position + upper case.
pub const MID_UC: OrthographicContext = 1 << 2;
/// Bit 3: the flag `ORTHO_MAP` stores for unknown position + upper case.
pub const UNK_UC: OrthographicContext = 1 << 3;
/// Bit 4: the flag `ORTHO_MAP` stores for initial position + lower case.
pub const BEG_LC: OrthographicContext = 1 << 4;
/// Bit 5: the flag `ORTHO_MAP` stores for internal position + lower case.
pub const MID_LC: OrthographicContext = 1 << 5;
/// Bit 6: the flag `ORTHO_MAP` stores for unknown position + lower case.
pub const UNK_LC: OrthographicContext = 1 << 6;

/// Mask covering all three upper-case flags.
pub const ORT_UC: OrthographicContext = BEG_UC | MID_UC | UNK_UC;
/// Mask covering all three lower-case flags.
pub const ORT_LC: OrthographicContext = BEG_LC | MID_LC | UNK_LC;
/// Lookup from a combined position/case byte to its [`OrthographicContext`] flag.
///
/// Every key equals `OrthographyPosition::as_byte() | LetterCase::as_byte()`
/// for one (position, case) pair, written as the ASCII byte literal that has
/// that value. There are no entries for `LetterCase::Unknown`.
pub static ORTHO_MAP: phf::Map<u8, OrthographicContext> = phf_map! {
    b'B' => BEG_UC, // 0b0100_0010 = Initial  | Upper
    b'"' => MID_UC, // 0b0010_0010 = Internal | Upper
    b'b' => UNK_UC, // 0b0110_0010 = Unknown  | Upper
    b'A' => BEG_LC, // 0b0100_0001 = Initial  | Lower
    b'!' => MID_LC, // 0b0010_0001 = Internal | Lower
    b'a' => UNK_LC, // 0b0110_0001 = Unknown  | Lower
};
/// Letter-case tag that can be encoded as a single byte.
///
/// The byte produced by [`as_byte`](Self::as_byte) occupies bits 0 and 1 only,
/// so it can be OR-ed with an `OrthographyPosition` byte (bits 5 and 6)
/// without overlap; the combined value is the key format used by `ORTHO_MAP`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum LetterCase {
    Upper,
    Lower,
    Unknown,
}

impl LetterCase {
    /// Returns the byte encoding of this case.
    ///
    /// `Upper` sets bit 1, `Lower` sets bit 0, and `Unknown` sets both
    /// (it equals `Upper | Lower`).
    #[inline]
    pub fn as_byte(&self) -> u8 {
        match self {
            Self::Upper => 0b0000_0010,
            Self::Lower => 0b0000_0001,
            Self::Unknown => 0b0000_0011,
        }
    }
}