use std::{ops::Range, sync::Arc};

use lazy_static::lazy_static;
use serde::{Deserialize, Serialize};

use super::{structure, Error};
use crate::rule::disambiguation::*;
use crate::rule::engine::composition::concrete::*;
use crate::rule::engine::composition::*;
use crate::rule::engine::*;
use crate::rule::grammar::*;
use crate::rule::{id::Index, DisambiguationRule, Rule, Unification};
use crate::{tokenizer::tag::Tagger, types::*};
use crate::{utils, utils::regex::Regex};

pub use structure::{read_disambiguation_rules, read_rules};
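
/// Default upper bound used where the XML allows `-1` to mean "unbounded"
/// (token `min` / `max` quantifiers and the `skip` attribute).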
#[inline]
fn max_matches() -> usize {
20
}
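
/// Maps a regex (keyed by its hash) to the set of word IDs it matches, if that
/// set has been computed. `word_hash` ties the cache to a specific word store,
/// so a stale cache can be detected when the tagger's vocabulary changes.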
#[derive(Serialize, Deserialize, Debug)]
pub(crate) struct RegexCache {
cache: DefaultHashMap<u64, Option<DefaultHashSet<WordIdInt>>>,
word_hash: u64,
}
impl RegexCache {
pub fn new(word_hash: u64) -> Self {
RegexCache {
cache: DefaultHashMap::default(),
word_hash,
}
}
pub fn word_hash(&self) -> &u64 {
&self.word_hash
}
pub(crate) fn get(&self, key: &u64) -> Option<&Option<DefaultHashSet<WordIdInt>>> {
self.cache.get(key)
}
pub(crate) fn insert(&mut self, key: u64, value: Option<DefaultHashSet<WordIdInt>>) {
self.cache.insert(key, value);
}
}
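
/// State threaded through rule compilation: the tagger plus a cache for
/// regex-over-vocabulary lookups.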
pub(crate) struct BuildInfo {
tagger: Arc<Tagger>,
regex_cache: RegexCache,
}
impl BuildInfo {
pub fn new(tagger: Arc<Tagger>, regex_cache: RegexCache) -> Self {
BuildInfo {
tagger,
regex_cache,
}
}
pub fn tagger(&self) -> &Arc<Tagger> {
&self.tagger
}
pub fn mut_regex_cache(&mut self) -> &mut RegexCache {
&mut self.regex_cache
}
}
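
/// Builds an [`Atom`] from the attributes of a `token`-like XML element:
/// text (optionally a Java regex), POS tag, chunk and space-before conditions
/// are combined into a single `AndAtom`.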
fn parse_match_attribs(
attribs: impl structure::MatchAttributes,
text: Option<&str>,
case_sensitive: bool,
text_match_idx: Option<usize>,
info: &mut BuildInfo,
) -> Result<Atom, Error> {
let mut atoms: Vec<Atom> = Vec::new();
let case_sensitive = match attribs.case_sensitive().as_deref() {
Some("yes") => true,
Some("no") => false,
None => case_sensitive,
x => panic!("unknown case_sensitive value {:?}", x),
};
let inflected = match attribs.inflected().as_deref() {
Some("yes") => true,
Some("no") => false,
None => false,
x => panic!("unknown inflected value {:?}", x),
};
let is_regex = match attribs.regexp().as_deref() {
Some("yes") => true,
None => false,
x => panic!("unknown regexp value {:?}", x),
};
let is_postag_regexp = match attribs.postag_regexp().as_deref() {
Some("yes") => true,
None => false,
x => panic!("unknown postag_regexp value {:?}", x),
};
let negate = match attribs.negate().as_deref() {
Some("yes") => true,
None => false,
x => panic!("unknown negate value {:?}", x),
};
let negate_pos = match attribs.negate_pos().as_deref() {
Some("yes") => true,
None => false,
x => panic!("unknown negate_pos value {:?}", x),
};
let mut inflect_matcher = None;
let mut pos_matcher = None;
if text.is_some() || text_match_idx.is_some() {
let matcher = if is_regex {
if let Some(text) = text {
let regex = Regex::from_java_regex(text.trim(), true, case_sensitive);
Matcher::new_regex(regex?, negate, inflected)
} else {
return Err(Error::Unexpected("`text` must be set if regex".into()));
}
} else {
Matcher::new_string(
text_match_idx.map_or_else(
|| {
either::Left(
text.expect("either `text_match_idx` or `text` are set.")
.trim()
.to_string(),
)
},
|id| either::Right(GraphId(id)),
),
negate,
case_sensitive,
inflected,
)
};
if inflected {
inflect_matcher = Some(matcher);
} else {
atoms.push(
(TextAtom {
matcher: TextMatcher::new(matcher, info),
})
.into(),
);
}
}
if let Some(postag) = attribs.postag() {
let raw_matcher = if is_postag_regexp {
let regex = Regex::from_java_regex(&postag.trim(), true, true);
Matcher::new_regex(regex?, negate_pos, true)
} else {
Matcher::new_string(
either::Left(postag.trim().to_string()),
negate_pos,
true,
true,
)
};
pos_matcher = Some(PosMatcher::new(raw_matcher, info));
}
if pos_matcher.is_some() || inflect_matcher.is_some() {
let matcher = WordDataMatcher {
pos_matcher,
inflect_matcher: inflect_matcher.map(|x| TextMatcher::new(x, info)),
};
atoms.push(
(WordDataAtom {
matcher,
case_sensitive,
})
.into(),
);
}
match (attribs.chunk(), attribs.chunk_re()) {
(Some(chunk), None) => {
let chunk_atom = ChunkAtom {
matcher: Matcher::new_string(
either::Left(chunk.trim().to_string()),
false,
true,
true,
),
};
atoms.push(chunk_atom.into());
}
(None, Some(chunk_re)) => {
let regex = Regex::from_java_regex(chunk_re.trim(), true, true)?;
let chunk_atom = ChunkAtom {
matcher: Matcher::new_regex(regex, false, true),
};
atoms.push(chunk_atom.into());
}
(None, None) => {}
_ => panic!("unexpected combination of chunk / chunk_re values."),
}
if let Some(space_before) = attribs.spacebefore() {
let value = match space_before.as_str() {
"yes" => true,
"no" => false,
_ => panic!("unknown spacebefore value {}", space_before),
};
atoms.push((SpaceBeforeAtom { value }).into());
}
Ok(AndAtom::and(atoms))
}
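
/// Collects the `exception` elements of a token into a negated `OrAtom`.
/// If `only_shifted` is set, only exceptions whose `scope` shifts them to a
/// neighboring token (`next` / `previous`) are kept; this is used for `skip` parts.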
fn get_exceptions(
token: &structure::Token,
case_sensitive: bool,
only_shifted: bool,
info: &mut BuildInfo,
) -> Result<Atom, Error> {
if let Some(parts) = &token.parts {
let exceptions: Vec<Atom> = parts
.iter()
.filter_map(|x| match x {
structure::TokenPart::Exception(x) => Some(x),
_ => None,
})
            .filter_map(|x| {
                let exception_text = x.text.as_deref();
let mut atom =
match parse_match_attribs(x, exception_text, case_sensitive, None, info) {
Ok(atom) => atom,
Err(err) => return Some(Err(err)),
};
let offset = if let Some(scope) = &x.scope {
match scope.as_str() {
"next" => 1,
"current" => 0,
"previous" => -1,
_ => panic!("unknown scope value {}", scope),
}
} else {
0
};
if offset != 0 {
atom = OffsetAtom::new(atom, offset).into();
}
if !only_shifted || (offset != 0) {
Some(Ok(atom))
} else {
None
}
})
.collect::<Result<Vec<_>, Error>>()?;
Ok(NotAtom::not(OrAtom::or(exceptions)))
} else {
Ok((TrueAtom {}).into())
}
}
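
/// Converts a `token` element into one or more pattern [`Part`]s.
/// `min` / `max` become a quantifier (`-1` meaning "up to `max_matches()`",
/// and a raised `min` with no `max` is treated as unbounded). A `skip`
/// attribute adds an invisible, non-greedy part matching the shifted exceptions.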
fn parse_token(
token: &structure::Token,
case_sensitive: bool,
info: &mut BuildInfo,
) -> Result<Vec<Part>, Error> {
let mut parts = Vec::new();
let text = if let Some(parts) = &token.parts {
parts.iter().find_map(|x| match x {
structure::TokenPart::Text(text) => Some(text.as_str()),
_ => None,
})
} else {
None
};
let text_match_idx = if let Some(parts) = &token.parts {
match parts.iter().find_map(|x| match x {
structure::TokenPart::Sub(sub) => Some(sub.no.parse::<usize>().map(|x| x + 1)),
_ => None,
}) {
None => None,
Some(Ok(x)) => Some(x),
Some(Err(err)) => return Err(err.into()),
}
} else {
None
};
let min = token
.min
.clone()
.map(|x| {
if x == "-1" {
max_matches()
} else {
x.parse().expect("can't parse min as usize")
}
})
.unwrap_or(1usize);
let mut max = token
.max
.clone()
.map(|x| {
if x == "-1" {
max_matches()
} else {
x.parse().expect("can't parse max as usize")
}
})
.unwrap_or(1usize);
if min > 1 && max == 1 {
max = max_matches();
}
let quantifier = Quantifier::new(min, max);
let mut atom = parse_match_attribs(token, text, case_sensitive, text_match_idx, info)?;
atom = AndAtom::and(vec![
atom,
get_exceptions(token, case_sensitive, false, info)?,
]);
parts.push(Part {
atom,
quantifier,
visible: true,
greedy: true,
unify: token.unify.as_ref().map(|x| x == "yes"),
});
if let Some(to_skip) = token.skip.clone() {
let to_skip = if to_skip == "-1" {
max_matches()
} else {
to_skip.parse().expect("can't parse skip as usize or -1")
};
parts.push(Part {
atom: get_exceptions(token, case_sensitive, true, info)?,
quantifier: Quantifier::new(0, to_skip),
visible: false,
greedy: false,
unify: None,
});
}
Ok(parts)
}
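
/// Parses a `match` element (a backreference to a matched token) as used in
/// messages and suggestions, including case conversion and POS / regex replacement.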
fn parse_match(m: structure::Match, engine: &Engine, info: &mut BuildInfo) -> Result<Match, Error> {
if m.postag.is_some()
|| m.postag_regex.is_some()
|| m.postag_replace.is_some()
|| m.text.is_some()
{
return Err(Error::Unimplemented(
"postag, postag_regex, postag_replace and text in `match` are not implemented.".into(),
));
}
if m.include_skipped.is_some() {
return Err(Error::Unimplemented(
"include_skipped in `match` is not implemented.".into(),
));
}
    let id = m.no.parse::<usize>().expect("no must be parsable as usize.");
    let case_conversion = m.case_conversion.as_deref();
let pos_replacer = if let Some(postag) = m.postag {
if postag.contains("+DT") || postag.contains("+INDT") {
return Err(Error::Unimplemented(
"+DT and +INDT determiners are not implemented.".into(),
));
}
let matcher = match m.postag_regex.as_deref() {
Some("yes") => {
let regex = Regex::from_java_regex(&postag, true, false)?;
Matcher::new_regex(regex, false, true)
}
None => Matcher::new_string(either::Left(postag), false, false, true),
x => panic!("unknown postag_regex value {:?}", x),
};
Some(PosReplacer {
matcher: PosMatcher::new(matcher, info),
})
} else {
None
};
let regex_replacer = match (m.regexp_match, m.regexp_replace) {
(Some(regex_match), Some(regex_replace)) => Some((
            Regex::from_java_regex(&regex_match, false, true)?,
regex_replace,
)),
_ => None,
};
Ok(Match {
id: engine.to_graph_id(id)?,
conversion: match case_conversion {
Some("alllower") => Conversion::AllLower,
Some("startlower") => Conversion::StartLower,
Some("startupper") => Conversion::StartUpper,
Some("allupper") => Conversion::AllUpper,
Some(x) => {
return Err(Error::Unimplemented(format!(
"case conversion {} not supported.",
x
)))
}
None => Conversion::Nop,
},
pos_replacer,
regex_replacer,
})
}
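
/// Splits suggestion text into literal parts and `\n` backreferences: for
/// example, `"did \1"` becomes a `Text("did ")` part followed by a `Match`
/// part for graph ID 1.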
fn parse_synthesizer_text(text: &str, engine: &Engine) -> Result<Vec<SynthesizerPart>, Error> {
lazy_static! {
static ref MATCH_REGEX: Regex = Regex::new(r"\\(\d)".into());
}
let mut parts = Vec::new();
let mut end_index = 0;
for capture in MATCH_REGEX.captures_iter(&text) {
let mat = capture.get(0).expect("0th regex group exists");
if end_index != mat.start() {
parts.push(SynthesizerPart::Text(
(&text[end_index..mat.start()]).to_string(),
))
}
let id = capture
.get(1)
.expect("1st regex group exists")
.as_str()
.parse::<usize>()
.expect("match regex capture must be parsable as usize.");
parts.push(SynthesizerPart::Match(
Match {
id: engine.to_graph_id(id)?,
conversion: Conversion::Nop,
pos_replacer: None,
regex_replacer: None,
}
.into(),
));
end_index = mat.end();
}
if end_index < text.len() {
parts.push(SynthesizerPart::Text((&text[end_index..]).to_string()))
}
Ok(parts)
}
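
/// Parses a `suggestion` element into a [`Synthesizer`]. Titlecase adjustment
/// is only enabled for token-based engines.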
fn parse_suggestion(
data: structure::Suggestion,
engine: &Engine,
info: &mut BuildInfo,
) -> Result<Synthesizer, Error> {
let mut parts = Vec::new();
for part in data.parts {
match part {
structure::SuggestionPart::Text(text) => {
parts.extend(parse_synthesizer_text(text.as_str(), engine)?);
}
structure::SuggestionPart::Match(m) => {
parts.push(SynthesizerPart::Match(parse_match(m, engine, info)?.into()));
}
}
}
Ok(Synthesizer {
parts,
use_titlecase_adjust: matches!(engine, Engine::Token(_)),
})
}
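
/// Returns the next graph ID after `parts`: IDs start at 1 and only visible
/// parts advance the index.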
fn get_last_id(parts: &[Part]) -> isize {
parts.iter().fold(1, |a, x| a + x.visible as isize)
}
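
/// Parses tokens that must match in parallel (inside `and` / `or` elements)
/// into plain atoms; quantifiers and `skip` are not supported there.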
fn parse_parallel_tokens(
tokens: &[structure::Token],
case_sensitive: bool,
info: &mut BuildInfo,
) -> Result<Vec<Atom>, Error> {
tokens
.iter()
.map(|x| {
let mut parsed = parse_token(x, case_sensitive, info)?;
if parsed.len() != 1 || parsed[0].quantifier.min != 1 || parsed[0].quantifier.max != 1 {
return Err(Error::Unimplemented(
"control flow in parallel tokens is not implemented.".into(),
));
}
Ok(parsed.remove(0).atom)
})
.collect()
}
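
/// Parses a sequence of token combinations (plain tokens, `and` / `or` groups,
/// `feature` markers) into pattern parts. Features contribute no parts here;
/// they are resolved separately by `parse_features`.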
fn parse_tokens(
tokens: &[structure::TokenCombination],
case_sensitive: bool,
info: &mut BuildInfo,
) -> Result<Vec<Part>, Error> {
let mut out = Vec::new();
for token_combination in tokens {
out.extend(match token_combination {
structure::TokenCombination::Token(token) => parse_token(token, case_sensitive, info)?,
structure::TokenCombination::And(tokens) => {
let atom =
AndAtom::and(parse_parallel_tokens(&tokens.tokens, case_sensitive, info)?);
vec![Part {
atom,
quantifier: Quantifier::new(1, 1),
greedy: true,
visible: true,
unify: tokens.tokens[0].unify.as_ref().map(|x| x == "yes"),
}]
}
structure::TokenCombination::Or(tokens) => {
let atom = OrAtom::or(parse_parallel_tokens(&tokens.tokens, case_sensitive, info)?);
vec![Part {
atom,
quantifier: Quantifier::new(1, 1),
greedy: true,
visible: true,
unify: tokens.tokens[0].unify.as_ref().map(|x| x == "yes"),
}]
}
structure::TokenCombination::Feature(_) => Vec::new(),
});
}
Ok(out)
}
fn parse_pattern(
pattern: structure::Pattern,
info: &mut BuildInfo,
) -> Result<(Composition, usize, usize), Error> {
let mut start = None;
let mut end = None;
let mut composition_parts = Vec::new();
let case_sensitive = match &pattern.case_sensitive {
Some(string) => string == "yes",
None => false,
};
for part in &pattern.parts {
match part {
structure::PatternPart::Token(token) => {
composition_parts.extend(parse_token(token, case_sensitive, info)?)
}
structure::PatternPart::Marker(marker) => {
start = Some(get_last_id(&composition_parts));
composition_parts.extend(parse_tokens(&marker.tokens, case_sensitive, info)?);
end = Some(get_last_id(&composition_parts));
}
structure::PatternPart::And(tokens) => {
let atom =
AndAtom::and(parse_parallel_tokens(&tokens.tokens, case_sensitive, info)?);
composition_parts.push(Part {
atom,
quantifier: Quantifier::new(1, 1),
greedy: true,
visible: true,
unify: tokens.tokens[0].unify.as_ref().map(|x| x == "yes"),
});
}
structure::PatternPart::Or(tokens) => {
let atom = OrAtom::or(parse_parallel_tokens(&tokens.tokens, case_sensitive, info)?);
composition_parts.push(Part {
atom,
quantifier: Quantifier::new(1, 1),
greedy: true,
visible: true,
unify: tokens.tokens[0].unify.as_ref().map(|x| x == "yes"),
});
}
structure::PatternPart::Feature(_) => {}
}
}
let start = start.unwrap_or(1) as usize;
let end = end.unwrap_or_else(|| get_last_id(&composition_parts)) as usize - 1;
let composition = Composition::new(composition_parts)?;
Ok((composition, start, end))
}
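
/// Resolves `feature` references in a pattern against the rule file's
/// `unification` definitions, yielding one list of POS filters per feature.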
fn parse_features(
pattern: &structure::Pattern,
unifications: &Option<Vec<structure::Unification>>,
info: &mut BuildInfo,
) -> Vec<Vec<PosFilter>> {
let mut filters = Vec::new();
    let mut parse_feature = |id: &str| -> Vec<PosFilter> {
        let unification = unifications
            .as_ref()
            .expect("`feature` is only valid with `unification` definitions")
            .iter()
            .find(|x| x.feature == id)
            .expect("referenced feature must be defined in `unifications`");
unification
.equivalences
.iter()
.map(|equiv| {
parse_pos_filter(
&equiv.token.postag,
equiv.token.postag_regexp.as_deref(),
info,
)
})
.collect()
};
for part in &pattern.parts {
match part {
structure::PatternPart::Feature(feature) => filters.push(parse_feature(&feature.id)),
structure::PatternPart::Marker(marker) => {
for token_combination in &marker.tokens {
if let structure::TokenCombination::Feature(feature) = token_combination {
filters.push(parse_feature(&feature.id));
}
}
}
_ => {}
}
}
filters
}
impl Rule {
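    /// Compiles a grammar rule from its XML structure. Rules with a `filter`,
    /// without suggestions, or with unsupported `match` options are rejected.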
pub(crate) fn from_rule_structure(
data: structure::Rule,
info: &mut BuildInfo,
) -> Result<Rule, Error> {
if data.filter.is_some() {
return Err(Error::Unimplemented(
"rules with filter are not implemented.".into(),
));
}
let (engine, start, end) = match (&data.pattern, data.regex) {
(Some(_), Some(_)) => Err(Error::Unexpected(
"must not contain both `pattern` and `regexp`.".into(),
)),
(None, None) => Err(Error::Unexpected(
"either `pattern` or `regexp` must be supplied.".into(),
)),
(Some(pattern), None) => {
let (composition, start, end) = parse_pattern(pattern.clone(), info)?;
let antipatterns = if let Some(antipatterns) = data.antipatterns {
antipatterns
.into_iter()
.map(|pattern| parse_pattern(pattern, info).map(|x| x.0))
.collect::<Result<Vec<_>, Error>>()?
} else {
Vec::new()
};
if antipatterns
.iter()
.any(|pattern| pattern.parts.iter().any(|x| x.unify.is_some()))
{
return Err(Error::Unimplemented(
"`unify` in antipattern is not supported.".into(),
));
}
Ok((
Engine::Token(TokenEngine {
composition,
antipatterns,
}),
start,
end,
))
}
(None, Some(regex)) => {
let case_sensitive = match regex.case_sensitive.as_deref() {
Some("yes") => true,
None => false,
x => panic!("unknown case_sensitive value {:?}", x),
};
let mark = regex.mark.map_or(Ok(0), |x| x.parse())?;
                let regex = Regex::from_java_regex(&regex.text, false, case_sensitive)?;
                let id_to_idx: DefaultHashMap<GraphId, usize> = (0..regex.captures_len() + 1)
                    .map(|idx| (GraphId(idx), idx))
                    .collect();
Ok((Engine::Text(regex.into(), id_to_idx), mark, mark))
}
}?;
let maybe_composition = if let Engine::Token(engine) = &engine {
Some(&engine.composition)
} else {
None
};
let unify_data = if let Some(pattern) = &data.pattern {
let unify_filters = parse_features(&pattern, &data.unifications, info);
let unify_mask: Vec<_> = maybe_composition
.unwrap()
.parts
.iter()
.map(|part| part.unify)
.collect();
Some((unify_filters, unify_mask))
} else {
None
};
let mut message_parts = Vec::new();
let mut suggesters = Vec::new();
for part in data.message.parts {
match part {
structure::MessagePart::Suggestion(suggestion) => {
let suggester = parse_suggestion(suggestion.clone(), &engine, info)?;
message_parts.extend(parse_suggestion(suggestion, &engine, info)?.parts);
suggesters.push(suggester);
}
structure::MessagePart::Text(text) => {
message_parts.extend(parse_synthesizer_text(text.as_str(), &engine)?);
}
structure::MessagePart::Match(m) => {
message_parts.push(SynthesizerPart::Match(
parse_match(m, &engine, info)?.into(),
));
}
}
}
if let Some(suggestions) = data.suggestions {
for suggestion in suggestions {
suggesters.push(parse_suggestion(suggestion, &engine, info)?);
}
}
if suggesters.is_empty() {
return Err(Error::Unimplemented(
"rules with no suggestion are not implemented.".into(),
));
}
assert!(!message_parts.is_empty(), "Rules must have a message.");
let mut examples = Vec::new();
for example in &data.examples {
if example.kind.is_some() {
return Err(Error::Unimplemented(
"examples with `type` (i. e. 'triggers_error') are not implemented.".into(),
));
}
let mut texts = Vec::new();
let mut suggestion: Option<Suggestion> = None;
for part in &example.parts {
match part {
structure::ExamplePart::Text(text) => {
texts.push(text.as_str());
}
structure::ExamplePart::Marker(marker) => {
let (bytes_before, chars_before) =
texts.iter().fold((0, 0), |acc, text| {
(acc.0 + text.len(), acc.1 + text.chars().count())
});
if suggestion.is_some() {
return Err(Error::Unexpected(
"example must have one or zero markers".into(),
));
}
texts.push(marker.text.as_str());
if let Some(correction_text) = &example.correction {
let mut replacements: Vec<_> =
correction_text.split('|').map(|x| x.to_string()).collect();
replacements = if chars_before == 0 {
replacements
.into_iter()
.map(|x| {
utils::apply_to_first(&x, |c| c.to_uppercase().collect())
})
.collect()
} else {
replacements
};
suggestion = Some(Suggestion::new(
"_Test".into(),
"_Test".into(),
Span::new(
bytes_before..bytes_before + marker.text.len(),
chars_before..chars_before + marker.text.chars().count(),
),
replacements,
));
}
}
}
}
examples.push(Example {
text: texts.join(""),
suggestion,
});
}
let unification = if let Some((unify_filters, unify_mask)) = unify_data {
if unify_filters.is_empty() {
None
} else {
Some(Unification {
filters: unify_filters,
mask: unify_mask,
})
}
} else {
None
};
Ok(Rule {
start: engine.to_graph_id(start)?,
end: engine.to_graph_id(end)?,
engine,
unification,
examples,
suggesters,
message: Synthesizer {
parts: message_parts,
use_titlecase_adjust: true,
},
url: data.url.map(|x| x.to_string()),
short: data.short.map(|x| x.to_string()),
id: Index::default(),
name: String::new(),
category_name: String::new(),
category_type: None,
enabled: true,
})
}
}
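
/// Parses LanguageTool's example tag notation, e.g. `form[lemma/POS,lemma2/POS2]`,
/// into an owned word with its tag readings. `</S>` entries are skipped.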
fn parse_tag_form(form: &str, info: &mut BuildInfo) -> Result<owned::Word, Error> {
lazy_static! {
static ref REGEX: Regex = Regex::new(r"(.+?)\[(.+?)\]".into());
}
let captures = REGEX
.captures(form)
.ok_or_else(|| Error::Unexpected(format!("tag form must match regex, found '{}'", form)))?;
let text = captures.get(1).expect("1st regex group exists").as_str();
let tags = captures.get(2).expect("2nd regex group exists").as_str();
let tags = tags
.split(',')
.filter_map(|x| {
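            // `</S>` marks the sentence end and carries no word data.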
if x == "</S>" {
return None;
}
let parts: Vec<_> = x.split('/').collect();
if parts.len() < 2 {
None
} else {
Some(owned::WordData::new(
info.tagger.id_word(parts[0].into()).to_owned_id(),
info.tagger.id_tag(parts[1]).to_owned_id(),
))
}
})
.collect();
Ok(owned::Word {
text: info.tagger.id_word(text.into()).to_owned_id(),
tags,
})
}
impl owned::WordData {
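    /// Builds an owned word-data entry from XML, interning lemma and POS tag
    /// through the tagger (falling back to the empty string for either).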
fn from_structure(data: structure::WordData, info: &mut BuildInfo) -> Self {
owned::WordData::new(
            info.tagger
                .id_word(data.lemma.unwrap_or_default().into())
                .to_owned_id(),
            info.tagger
                .id_tag(data.pos.as_ref().map_or("", |x| x.as_str().trim()))
                .to_owned_id(),
)
}
}
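
/// Builds a [`PosFilter`] from a POS tag pattern; with `postag_regexp="yes"`
/// the tag is interpreted as a Java regex.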
fn parse_pos_filter(postag: &str, postag_regexp: Option<&str>, info: &mut BuildInfo) -> PosFilter {
match postag_regexp.as_deref() {
Some("yes") => PosFilter::new(PosMatcher::new(
Matcher::new_regex(
Regex::from_java_regex(&postag, true, true).unwrap(),
false,
true,
),
info,
)),
Some(_) | None => PosFilter::new(PosMatcher::new(
Matcher::new_string(either::Left(postag.into()), false, false, true),
info,
)),
}
}
impl DisambiguationRule {
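    /// Compiles a disambiguation rule from its XML structure, mapping the
    /// `disambig` action (`remove`, `add`, `replace`, `filter`, `filterall`,
    /// `unify`, `immunize`, `ignore_spelling`) to a [`Disambiguation`].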
pub(crate) fn from_rule_structure(
data: structure::DisambiguationRule,
info: &mut BuildInfo,
) -> Result<DisambiguationRule, Error> {
let (composition, start, end) = parse_pattern(data.pattern.clone(), info)?;
let unify_filters = parse_features(&data.pattern, &data.unifications, info);
let unify_mask: Vec<_> = composition.parts.iter().map(|part| part.unify).collect();
let antipatterns = if let Some(antipatterns) = data.antipatterns {
antipatterns
.into_iter()
.map(|pattern| parse_pattern(pattern, info).map(|x| x.0))
.collect::<Result<Vec<_>, Error>>()?
} else {
Vec::new()
};
if antipatterns
.iter()
.any(|pattern| pattern.parts.iter().any(|x| x.unify.is_some()))
{
return Err(Error::Unimplemented(
"`unify` in antipattern is not supported.".into(),
));
}
let engine = Engine::Token(TokenEngine {
composition,
antipatterns,
});
let word_datas: Vec<_> = if let Some(wds) = data.disambig.word_datas {
wds.into_iter()
.map(|part| match part {
structure::DisambiguationPart::WordData(x) => {
either::Left(owned::WordData::from_structure(x, info))
}
structure::DisambiguationPart::Match(x) => either::Right(parse_pos_filter(
&x.postag.unwrap(),
x.postag_regexp.as_deref(),
info,
)),
})
.collect()
} else {
Vec::new()
};
let disambiguations = match data.disambig.action.as_deref() {
Some("remove") => {
if let Some(postag) = data.disambig.postag.as_ref() {
Ok(Disambiguation::Remove(vec![either::Right(
parse_pos_filter(postag, Some("yes"), info),
)]))
} else {
Ok(Disambiguation::Remove(word_datas.into_iter().collect()))
}
}
Some("add") => {
if data.disambig.postag.is_some() {
return Err(Error::Unimplemented(
"postag not supported for `add`.".into(),
));
}
Ok(Disambiguation::Add(
word_datas
.into_iter()
.map(|x| x.left().expect("match not supported for `add`"))
.collect(),
))
}
Some("replace") => Ok(Disambiguation::Replace(
word_datas
.into_iter()
.map(|x| {
x.left()
.expect("match not supported for `replace` disambiguation")
})
.collect(),
)),
Some("ignore_spelling") => Ok(Disambiguation::Nop), Some("immunize") => Ok(Disambiguation::Nop), Some("filterall") => {
let mut disambig = Vec::new();
let mut marker_disambig = Vec::new();
let mut has_marker = false;
for part in &data.pattern.parts {
match part {
structure::PatternPart::Marker(marker) => {
has_marker = true;
for token in &marker.tokens {
let token = match token {
structure::TokenCombination::Token(token) => token,
structure::TokenCombination::And(tokens)
| structure::TokenCombination::Or(tokens) => &tokens.tokens[0],
structure::TokenCombination::Feature(_) => continue,
};
marker_disambig.push(token.postag.as_ref().map(|x| {
either::Right(parse_pos_filter(
x,
token.postag_regexp.as_deref(),
info,
))
}));
}
}
structure::PatternPart::Token(token) => {
disambig.push(token.postag.as_ref().map(|x| {
either::Right(parse_pos_filter(
x,
token.postag_regexp.as_deref(),
info,
))
}))
}
structure::PatternPart::And(tokens)
| structure::PatternPart::Or(tokens) => {
disambig.push(tokens.tokens[0].postag.as_ref().map(|x| {
either::Right(parse_pos_filter(
x,
tokens.tokens[0].postag_regexp.as_deref(),
info,
))
}))
}
structure::PatternPart::Feature(_) => {}
}
}
let disambiguations = if has_marker {
marker_disambig
} else {
disambig
};
Ok(Disambiguation::Filter(
disambiguations.into_iter().collect(),
info.tagger().lang_options().retain_last,
))
}
Some("filter") => {
if let Some(postag) = data.disambig.postag.as_ref() {
Ok(Disambiguation::Filter(
vec![Some(either::Right(parse_pos_filter(
postag,
Some("yes"),
info,
)))],
info.tagger().lang_options().retain_last,
))
} else {
Ok(Disambiguation::Filter(
word_datas.into_iter().map(Some).collect(),
info.tagger().lang_options().retain_last,
))
}
}
Some("unify") => {
let mut mask = Vec::new();
let mut marker_mask = Vec::new();
let mut disambig = Vec::new();
let mut marker_disambig = Vec::new();
let mut has_marker = false;
for part in &data.pattern.parts {
match part {
structure::PatternPart::Marker(marker) => {
has_marker = true;
for token in &marker.tokens {
let token = match token {
structure::TokenCombination::Token(token) => token,
structure::TokenCombination::And(tokens)
| structure::TokenCombination::Or(tokens) => &tokens.tokens[0],
structure::TokenCombination::Feature(_) => continue,
};
marker_disambig.push(token.postag.as_ref().map(|x| {
parse_pos_filter(x, token.postag_regexp.as_deref(), info)
}));
marker_mask.push(token.unify.is_some())
}
}
structure::PatternPart::Token(token) => {
disambig.push(token.postag.as_ref().map(|x| {
parse_pos_filter(x, token.postag_regexp.as_deref(), info)
}));
mask.push(token.unify.is_some());
}
structure::PatternPart::And(tokens)
| structure::PatternPart::Or(tokens) => {
disambig.push(tokens.tokens[0].postag.as_ref().map(|x| {
parse_pos_filter(x, tokens.tokens[0].postag_regexp.as_deref(), info)
}));
mask.push(tokens.tokens[0].unify.is_some());
}
structure::PatternPart::Feature(_) => {}
}
}
let (disambig, mask) = if has_marker {
(marker_disambig, marker_mask)
} else {
(disambig, mask)
};
Ok(Disambiguation::Unify(unify_filters.clone(), disambig, mask))
}
None => {
if let Some(postag) = data.disambig.postag.as_ref() {
Ok(Disambiguation::Filter(
vec![Some(either::Left(owned::WordData::new(
info.tagger.id_word("".into()).to_owned_id(),
info.tagger.id_tag(postag).to_owned_id(),
)))],
info.tagger().lang_options().retain_last,
))
} else {
Ok(Disambiguation::Filter(
word_datas.into_iter().map(Some).collect(),
info.tagger().lang_options().retain_last,
))
}
}
Some(x) => Err(Error::Unimplemented(format!("action {}", x))),
}?;
let filter = if let Some(filter_data) = data.filter {
let args = filter_data
.args
.split(' ')
.map(|x| {
                    let idx = x.find(':').expect("filter args must be `key:value` pairs");
(
x[..idx].to_string(),
x[(idx + ':'.len_utf8())..].to_string(),
)
})
.collect();
Some(super::impls::filters::get_filter(
filter_data.class.split('.').next_back().unwrap(),
args,
&engine,
)?)
} else {
None
};
let mut examples = Vec::new();
if let Some(examples_structure) = data.examples.as_ref() {
for example in examples_structure {
let mut texts = Vec::new();
let mut char_span: Option<Range<usize>> = None;
let mut char_length = 0;
for part in &example.parts {
match part {
structure::ExamplePart::Text(text) => {
texts.push(text.as_str());
char_length += text.chars().count();
}
structure::ExamplePart::Marker(marker) => {
if char_span.is_some() {
return Err(Error::Unexpected(
"example must have one or zero markers".into(),
));
}
texts.push(marker.text.as_str());
let length = marker.text.chars().count();
char_span = Some(char_length..char_length + length);
char_length += marker.text.chars().count();
}
}
}
let text = texts.join("");
let test = match example.kind.as_str() {
"untouched" => DisambiguationExample::Unchanged(text),
"ambiguous" => DisambiguationExample::Changed(DisambiguationChange {
text,
before: parse_tag_form(
example
.inputform
.as_ref()
.expect("must have inputform when ambiguous example"),
info,
)?,
                        after: parse_tag_form(
                            example
                                .outputform
                                .as_ref()
                                .expect("must have outputform when ambiguous example"),
                            info,
                        )?,
char_span: char_span.expect("must have marker when ambiguous example"),
}),
x => panic!("unknown disambiguation example type {}", x),
};
examples.push(test);
}
}
Ok(DisambiguationRule {
start: engine.to_graph_id(start)?,
end: engine.to_graph_id(end)?,
engine,
unification: if unify_filters.is_empty() {
None
} else {
Some(Unification {
filters: unify_filters,
mask: unify_mask,
})
},
filter,
disambiguations,
examples,
id: Index::default(),
})
}
}