use crate::types::*;
use crate::{
filter::{Filter, Filterable},
tokenizer::Tokenizer,
utils,
};
use itertools::Itertools;
use log::{error, info, warn};
use serde::{Deserialize, Serialize};
use std::collections::HashSet;
use std::fmt;
pub(crate) mod disambiguation;
pub(crate) mod engine;
pub(crate) mod grammar;
pub mod id;
use engine::Engine;
pub(crate) use engine::composition::{MatchGraph, MatchSentence};
pub use grammar::Example;
use self::{
disambiguation::PosFilter,
engine::{composition::GraphId, EngineMatches},
id::Index,
};
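
/// A unification constraint: the tokens in the match groups marked in `mask`
/// must satisfy at least one combination of the POS `filters`.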
#[derive(Serialize, Deserialize, Debug, Clone)]
pub(crate) struct Unification {
pub(crate) mask: Vec<Option<bool>>,
pub(crate) filters: Vec<Vec<PosFilter>>,
}
impl Unification {
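    /// Returns `true` if the match should be kept. Tokens in every marked group
    /// must satisfy at least one combination of the POS filters; the result is
    /// inverted if no mask entry is `Some(true)`.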
pub fn keep(&self, graph: &MatchGraph, sentence: &MatchSentence) -> bool {
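        // Every combination that picks one `PosFilter` from each set of filters.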
let filters: Vec<_> = self.filters.iter().multi_cartesian_product().collect();
let mut filter_mask: Vec<_> = filters.iter().map(|_| true).collect();
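        // The result is inverted if no mask entry is `Some(true)`.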
        let negate = self.mask.iter().all(|x| x.map_or(true, |val| !val));
for (group, maybe_mask_val) in graph.groups()[1..].iter().zip(self.mask.iter()) {
if maybe_mask_val.is_some() {
for token in group.tokens(sentence) {
for (mask_val, filter) in filter_mask.iter_mut().zip(filters.iter()) {
*mask_val = *mask_val && PosFilter::and(filter, token.word());
}
}
}
}
let result = filter_mask.iter().any(|x| *x);
if negate {
!result
} else {
result
}
}
}
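
/// A rule which changes the analysis (e.g. part-of-speech tags) of the tokens
/// it matches instead of suggesting a correction.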
#[derive(Debug, Serialize, Deserialize, Clone)]
pub struct DisambiguationRule {
pub(crate) id: Index,
pub(crate) engine: Engine,
pub(crate) disambiguations: disambiguation::Disambiguation,
pub(crate) filter: Option<Filter>,
pub(crate) start: GraphId,
pub(crate) end: GraphId,
pub(crate) examples: Vec<disambiguation::DisambiguationExample>,
pub(crate) unification: Option<Unification>,
}
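
/// The spans a [DisambiguationRule] changes: one entry per match, each holding
/// the token spans of every match group.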
#[derive(Default)]
pub(crate) struct Changes(Vec<Vec<HashSet<Span>>>);
impl Changes {
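    /// Shifts every recorded span to the left by `position`.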
fn lshift(self, position: Position) -> Self {
Changes(
self.0
.into_iter()
.map(|spans| {
spans
.into_iter()
.map(|group_spans| {
group_spans
.into_iter()
.map(|span| span.lshift(position))
.collect()
})
.collect()
})
.collect(),
)
}

    /// Returns `true` if the rule did not match anywhere.
    pub fn is_empty(&self) -> bool {
        self.0.is_empty()
    }
}
impl DisambiguationRule {
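    /// Returns the index identifying this rule.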
pub fn id(&self) -> &Index {
&self.id
}
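
    /// Computes the changes this rule would make to `sentence` without applying
    /// them. Returns empty `Changes` if the rule is a no-op or nothing matches.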
pub(crate) fn apply<'t>(&'t self, sentence: &MatchSentence<'t>) -> Changes {
if matches!(self.disambiguations, disambiguation::Disambiguation::Nop) {
return Changes::default();
}
let mut all_spans = Vec::new();
for graph in self.engine.get_matches(sentence, self.start, self.end) {
if let Some(unification) = &self.unification {
if !unification.keep(&graph, sentence) {
continue;
}
}
if let Some(filter) = &self.filter {
if !filter.keep(sentence, &graph) {
continue;
}
}
let mut spans = Vec::new();
for group_idx in GraphId::range(&self.start, &self.end) {
let group = graph.by_id(group_idx);
let group_spans: HashSet<_> =
group.tokens(sentence).map(|x| x.span().clone()).collect();
spans.push(group_spans);
}
all_spans.push(spans);
}
Changes(all_spans)
}
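
    /// Applies previously computed `changes` to `sentence`.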
pub(crate) fn change<'t>(&'t self, sentence: &mut IncompleteSentence<'t>, changes: Changes) {
        info!("applying {}", self.id);
for spans in changes.0 {
let mut groups = Vec::new();
let mut refs = sentence.iter_mut().collect::<Vec<_>>();
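            // Partition the mutable token references into their match groups by span.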
for group_spans in spans {
let mut group = Vec::new();
                while let Some(i) = refs.iter().position(|x| group_spans.contains(x.span())) {
group.push(refs.remove(i));
}
groups.push(group);
}
self.disambiguations.apply(groups);
}
}
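
    /// Checks each example of this rule against `tokenizer`, returning `true`
    /// if all of them pass. Failures listed as known failures in the language
    /// options are logged as warnings instead of errors.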
pub fn test(&self, tokenizer: &Tokenizer) -> bool {
let mut passes = Vec::new();
for (i, test) in self.examples.iter().enumerate() {
let text = match test {
disambiguation::DisambiguationExample::Unchanged(x) => x.as_str(),
disambiguation::DisambiguationExample::Changed(x) => x.text.as_str(),
};
let sentence_before = tokenizer.disambiguate_up_to_id(
tokenizer
.tokenize(text)
.expect("test text must not be empty"),
Some(&self.id),
);
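            // Shift the sentence to the right by one so that matching cannot rely on
            // the sentence starting at position zero; the changes are shifted back below.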
let shift_delta = Position { byte: 1, char: 1 };
let sentence_before_complete =
sentence_before.clone().rshift(shift_delta).into_sentence();
let changes = self
.apply(&MatchSentence::new(&sentence_before_complete))
.lshift(shift_delta);
let mut sentence_after = sentence_before.clone();
if !changes.is_empty() {
self.change(&mut sentence_after, changes);
}
info!("Tokens: {:#?}", sentence_before);
let pass = match test {
disambiguation::DisambiguationExample::Unchanged(_) => {
sentence_before == sentence_after
}
disambiguation::DisambiguationExample::Changed(change) => {
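                    // Assert that a token covering the expected span existed before the change.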
let _before = sentence_before
.iter()
.find(|x| *x.span().char() == change.char_span)
.unwrap();
let after = sentence_after
.iter()
.find(|x| *x.span().char() == change.char_span)
.unwrap();
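                    // The order of tags is irrelevant, so compare them as unordered sets.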
let unordered_tags = after
.word()
.tags()
.iter()
.map(|x| x.to_owned_word_data())
.collect::<HashSet<owned::WordData>>();
let unordered_tags: HashSet<_> = unordered_tags.iter().collect();
let unordered_tags_change = change
.after
.tags
.iter()
.collect::<HashSet<&owned::WordData>>();
after.word().as_str() == change.after.text.as_ref_id().as_str()
&& unordered_tags == unordered_tags_change
}
};
if !pass {
let error_str = format!(
"Rule {}: Test \"{:#?}\" failed. Before: {:#?}. After: {:#?}.",
self.id, test, sentence_before, sentence_after,
);
if tokenizer
.lang_options()
.known_failures
.contains(&format!("{}:{}", self.id, i))
{
warn!("{}", error_str)
} else {
error!("{}", error_str)
}
}
passes.push(pass);
}
passes.iter().all(|x| *x)
}
}
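
/// An iterator over the suggestions a [Rule] produces for a single sentence.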
pub struct Suggestions<'a, 't> {
rule: &'a Rule,
matches: EngineMatches<'a, 't>,
sentence: &'t MatchSentence<'t>,
}
impl<'a, 't> Iterator for Suggestions<'a, 't> {
type Item = Suggestion;
fn next(&mut self) -> Option<Self::Item> {
let rule = self.rule;
let sentence = self.sentence;
let (start, end) = (self.rule.start, self.rule.end);
self.matches.find_map(|graph| {
if let Some(unification) = &rule.unification {
if !unification.keep(&graph, sentence) {
return None;
}
}
let start_group = graph.by_id(start);
let end_group = graph.by_id(end);
let replacements: Vec<String> = rule
.suggesters
.iter()
.filter_map(|x| x.apply(sentence, &graph, start, end))
.collect();
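            // If every replacement starts with a character that must not be preceded by a
            // space, move the span start back to the end of the previous token so that the
            // intervening whitespace is replaced too.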
let start = if replacements
.iter()
.all(|x| utils::no_space_chars().chars().any(|c| x.starts_with(c)))
{
let first_token = graph.groups()[graph.get_index(start)..]
.iter()
.find_map(|x| x.tokens(sentence).next())
.unwrap();
let idx = sentence
.iter()
.position(|x| std::ptr::eq(x, first_token))
.unwrap_or(0);
if idx > 0 {
sentence.index(idx - 1).span().end()
} else {
start_group.span.start()
}
} else {
start_group.span.start()
};
let end = end_group.span.end();
if end < start {
return None;
}
let text_before = sentence.slice(Span::from_positions(start, end));
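            // Drop replacements that merely reproduce the existing text and fix up
            // no-space characters in the rest.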
let replacements: Vec<String> = replacements
.into_iter()
.filter(|suggestion| *suggestion != text_before)
.map(|x| utils::fix_nospace_chars(&x))
.collect();
if !replacements.is_empty() {
Some(Suggestion::new(
rule.id.to_string(),
rule.message
.apply(sentence, &graph, rule.start, rule.end)
.expect("Rules must have a message."),
Span::from_positions(start, end),
replacements,
))
} else {
None
}
})
}
}
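
/// A grammar rule which matches a pattern in a sentence and suggests
/// corrections for the matched text.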
#[derive(Serialize, Deserialize, Debug, Clone)]
pub struct Rule {
pub(crate) id: Index,
pub(crate) engine: Engine,
pub(crate) examples: Vec<Example>,
pub(crate) suggesters: Vec<grammar::Synthesizer>,
pub(crate) message: grammar::Synthesizer,
pub(crate) start: GraphId,
pub(crate) end: GraphId,
pub(crate) url: Option<String>,
pub(crate) short: Option<String>,
pub(crate) name: String,
pub(crate) category_name: String,
pub(crate) category_type: Option<String>,
pub(crate) unification: Option<Unification>,
pub(crate) enabled: bool,
}
impl fmt::Display for Rule {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
write!(f, "{}", self.id)
}
}
impl Rule {
pub fn enable(&mut self) {
self.enabled = true;
}
pub fn disable(&mut self) {
self.enabled = false;
}
pub fn enabled(&self) -> bool {
self.enabled
}
pub fn id(&self) -> &Index {
&self.id
}
pub fn short(&self) -> Option<&str> {
self.short.as_deref()
}
pub fn url(&self) -> Option<&str> {
self.url.as_deref()
}
pub fn examples(&self) -> &[Example] {
&self.examples
}
pub fn name(&self) -> &str {
&self.name
}
pub fn category_name(&self) -> &str {
&self.category_name
}
pub fn category_type(&self) -> Option<&str> {
self.category_type.as_deref()
}
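
    /// Lazily computes the suggestions this rule makes for `sentence`.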
pub(crate) fn apply<'a, 't>(&'a self, sentence: &'t MatchSentence<'t>) -> Suggestions<'a, 't> {
Suggestions {
matches: self.engine.get_matches(sentence, self.start, self.end),
            rule: self,
sentence,
}
}
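
    /// Checks each example of this rule against `tokenizer`, returning `true`
    /// if every example yields exactly the expected suggestion (or none).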
pub fn test(&self, tokenizer: &Tokenizer) -> bool {
let mut passes = Vec::new();
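        // Shift the sentence right by one before matching so that rules cannot rely
        // on the sentence starting at position zero; suggestions are shifted back.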
let shift_delta = Position { byte: 1, char: 1 };
for test in self.examples.iter() {
let sentence = tokenizer
.disambiguate(
tokenizer
                        .tokenize(test.text())
                        .expect("test text must not be empty"),
)
.rshift(shift_delta)
.into_sentence();
info!("Sentence: {:#?}", sentence);
let suggestions: Vec<_> = self
.apply(&MatchSentence::new(&sentence))
.map(|s| s.lshift(shift_delta))
.collect();
            let pass = match test.suggestion() {
                Some(correct_suggestion) => {
                    suggestions.len() == 1 && correct_suggestion == &suggestions[0]
                }
                None => suggestions.is_empty(),
            };
if !pass {
warn!(
"Rule {}: test \"{}\" failed. Expected: {:#?}. Found: {:#?}.",
self.id,
test.text(),
test.suggestion(),
suggestions
);
}
passes.push(pass);
}
passes.iter().all(|x| *x)
}
}