harper-core 2.0.0

The language checker for developers.
Documentation
use hashbrown::HashMap;
use serde::{Deserialize, Serialize};
use smallvec::ToSmallVec;

use super::super::word_map::{WordMap, WordMapEntry};
use super::Error;
use super::affix_replacement::AffixReplacement;
use super::expansion::Property;
use super::expansion::{
    AffixEntryKind,
    AffixEntryKind::{Prefix, Suffix},
    Expansion, HumanReadableExpansion,
};
use super::word_list::AnnotatedWord;
use crate::dict_word_metadata_orthography::OrthFlags;
use crate::spell::WordId;
use crate::{CharString, DictWordMetadata, Span};

/// The affix and property definitions used to expand annotated dictionary words.
///
/// Both maps are keyed by the single-character flag that appears in a word's
/// annotations; see [`AttributeList::expand_annotated_word`] for how they are applied.
#[derive(Debug, Clone)]
pub struct AttributeList {
    /// Key = Affix Flag
    /// Value = the expansion rule used to derive new words for that flag.
    affixes: HashMap<char, Expansion>,
    /// Key = Property Flag
    /// Value = the property whose metadata is merged into words carrying that flag.
    properties: HashMap<char, Property>,
}

impl AttributeList {
    fn into_human_readable(self) -> HumanReadableAttributeList {
        HumanReadableAttributeList {
            affixes: self
                .affixes
                .into_iter()
                .map(|(affix, exp)| (affix, exp.into_human_readable()))
                .collect(),
            properties: self.properties,
        }
    }

    /// Parse an attribute list from its JSON representation.
    ///
    /// # Errors
    /// Returns an [`Error`] if `source` is not valid JSON for a
    /// [`HumanReadableAttributeList`], or if converting the parsed list into its
    /// normal form fails.
    pub fn parse(source: &str) -> Result<Self, Error> {
        let parsed: HumanReadableAttributeList = serde_json::from_str(source)?;
        parsed.into_normal()
    }

    /// Expand an [`AnnotatedWord`] into a list of full words, including itself.
    ///
    /// The word and every form derived from it are written into `word_map`:
    /// 1. Properties named by the annotations are merged into the base word's metadata.
    /// 2. Each affix annotation generates derived words from its replacement rules.
    /// 3. Target metadata guarded by an `if_base` condition is deferred until the base
    ///    word's metadata is complete, then applied only if the condition holds.
    /// 4. Affixes flagged `cross_product` recursively expand each derived word with the
    ///    annotated affixes of the opposite kind.
    ///
    /// Every derived word has `derived_from` set to the [`WordId`] of the base word.
    ///
    /// # Arguments
    /// * `annotated_word` - The word to expand, along with its annotations
    /// * `word_map` - The [`WordMap`] that receives the expanded words and their metadata
    ///
    /// # Panics
    /// Panics if a derived word cannot be found in `word_map` at the point its metadata
    /// is updated. Each derived word is inserted earlier within the same call, so this
    /// is not expected to happen.
    pub fn expand_annotated_word(&self, annotated_word: AnnotatedWord, word_map: &mut WordMap) {
        // Pre-allocate space in the destination map for better performance
        word_map.reserve(annotated_word.annotations.len() + 1);

        // Initialize the metadata that will be stored for the base word
        let mut base_metadata = DictWordMetadata::default();

        // Derive the orthography flags from the base word's letters
        let orth_flags = OrthFlags::from_letters(&annotated_word.letters);
        base_metadata.orth_info = orth_flags;

        // Metadata that should only be applied if its condition is met by the base word.
        // Each entry is (derived word, metadata to apply, condition on the base word).
        let mut conditional_expansion_metadata = Vec::new();

        // First pass: Process all properties to build the base metadata
        // Properties directly modify the word's metadata (e.g., part of speech, usage)
        for attr in &annotated_word.annotations {
            let Some(property) = self.properties.get(attr) else {
                continue;
            };
            base_metadata.merge(&property.metadata);
        }

        // Second pass: Process all affix rules to generate derived forms
        for attr in &annotated_word.annotations {
            // Skip if this attribute isn't an affix rule
            let Some(expansion) = self.affixes.get(attr) else {
                continue;
            };

            // Add any base metadata from this affix rule
            base_metadata.merge(&expansion.base_metadata);

            // Track new words generated by this affix rule
            let mut new_words: HashMap<CharString, DictWordMetadata> = HashMap::new();

            // Apply each replacement rule in this affix
            for replacement in &expansion.replacements {
                if let Some(replaced) =
                    Self::apply_replacement(replacement, &annotated_word.letters, expansion.kind)
                {
                    // Get or create metadata for this new word form
                    let metadata = new_words.entry(replaced.clone()).or_default();

                    // Process each target for this replacement
                    for target in &expansion.target {
                        if let Some(condition) = &target.if_base {
                            // Store conditional metadata to be applied once the base word's
                            // metadata has been finalized
                            conditional_expansion_metadata.push((
                                replaced.clone(),
                                target.metadata.clone(),
                                condition.clone(),
                            ));
                        } else {
                            // Apply target metadata immediately
                            metadata.merge(&target.metadata);
                        }
                    }
                }
            }

            // Handle cross-product expansions (e.g., both prefix and suffix)
            if expansion.cross_product {
                // Collect the annotations that the derived words will be expanded with
                let mut opposite_attributes = Vec::new();

                // Add properties that should carry over to derived forms: every property
                // when the current affix is a prefix, otherwise only those marked `propagate`
                for attr in &annotated_word.annotations {
                    let Some(property) = self.properties.get(attr) else {
                        continue;
                    };
                    if expansion.kind == Prefix || property.propagate {
                        opposite_attributes.push(*attr);
                    }
                }

                // Add affix attributes of the opposite type
                for attr in &annotated_word.annotations {
                    let Some(attr_def) = self.affixes.get(attr) else {
                        continue;
                    };
                    // True when exactly one of the two affixes is a prefix, i.e. `attr` is
                    // of the opposite type to the affix currently being expanded
                    if (attr_def.kind != Prefix) != (expansion.kind != Prefix) {
                        opposite_attributes.push(*attr);
                    }
                }

                // Recursively process each new word form
                for (new_word, metadata) in new_words {
                    self.expand_annotated_word(
                        AnnotatedWord {
                            letters: new_word.clone(),
                            annotations: opposite_attributes.clone(),
                        },
                        word_map,
                    );
                    // Update the metadata of the expanded word; the recursive call above
                    // inserted `new_word` as its own base word, so the lookup should succeed
                    let target_metadata = word_map.get_metadata_mut_chars(&new_word).unwrap();
                    target_metadata.merge(&metadata);
                    target_metadata.derived_from =
                        Some(WordId::from_word_chars(&annotated_word.letters));
                }
            } else {
                // Simple case: no cross-product expansion needed
                for (key, mut value) in new_words.into_iter() {
                    value.derived_from = Some(WordId::from_word_chars(&annotated_word.letters));

                    if let Some(existing_metadata) = word_map.get_metadata_mut_chars(&key) {
                        // Merge with existing metadata
                        existing_metadata.merge(&value);
                    } else {
                        // Add new entry
                        word_map.insert(WordMapEntry {
                            canonical_spelling: key,
                            metadata: value,
                        });
                    }
                }
            }
        }

        // Finalize the metadata for the base word
        let mut full_metadata = base_metadata;

        // Merge with any existing metadata for this word
        if let Some(existing_metadata) = word_map.get_with_chars(&annotated_word.letters) {
            full_metadata.merge(&existing_metadata.metadata);
        }

        // Store the final metadata for the base word
        word_map.insert(WordMapEntry {
            metadata: full_metadata.clone(),
            canonical_spelling: annotated_word.letters,
        });

        // Process any conditional expansions
        for (letters, metadata, condition) in conditional_expansion_metadata {
            // The condition counts as satisfied when `or`-ing it into the base word's
            // metadata leaves that metadata unchanged
            let condition_satisfied = full_metadata.or(&condition) == full_metadata;
            if !condition_satisfied {
                continue;
            }

            // Apply the conditional metadata; `letters` was recorded as a derived word
            // above and has been inserted into the map by this point
            word_map
                .get_metadata_mut_chars(&letters)
                .unwrap()
                .merge(&metadata);
        }
    }

    /// Expand every annotated word yielded by `words` into `dest`.
    ///
    /// Each word is handed to [`Self::expand_annotated_word`] in iteration order.
    /// Note that this does __not__ guarantee that produced words will be
    /// unique.
    pub fn expand_annotated_words(
        &self,
        words: impl IntoIterator<Item = AnnotatedWord>,
        dest: &mut WordMap,
    ) {
        words
            .into_iter()
            .for_each(|annotated| self.expand_annotated_word(annotated, dest));
    }

    /// Apply a single affix replacement rule to `letters`.
    ///
    /// The rule's condition is matched against the end of the word for a suffix and
    /// against the start of the word for any other kind. When it matches, the rule's
    /// `remove` characters are stripped from that same end and its `add` characters
    /// are attached in their place.
    ///
    /// Returns `None` when the condition is longer than the word, when the condition
    /// does not match, or when the word does not carry the characters to remove.
    fn apply_replacement(
        replacement: &AffixReplacement,
        letters: &[char],
        kind: AffixEntryKind,
    ) -> Option<CharString> {
        let condition_len = replacement.condition.len();
        let word_len = letters.len();

        if condition_len > word_len {
            return None;
        }

        let is_suffix = kind == Suffix;

        // The slice of the word the condition is checked against.
        let window = if is_suffix {
            Span::new(word_len - condition_len, word_len)
        } else {
            Span::new(0, condition_len)
        };

        if !replacement.condition.matches(window.get_content(letters)) {
            return None;
        }

        let to_remove: &[char] = &replacement.remove;
        let to_add: &[char] = &replacement.add;

        let rebuilt = if is_suffix {
            // stem + add
            let stem = letters.strip_suffix(to_remove)?;
            let mut word: CharString = stem.to_smallvec();
            word.extend(to_add.iter().copied());
            word
        } else {
            // add + rest
            let rest = letters.strip_prefix(to_remove)?;
            let mut word: CharString = to_add.to_smallvec();
            word.extend(rest.iter().copied());
            word
        };

        Some(rebuilt)
    }
}

/// The serializable counterpart of [`AttributeList`].
///
/// [`AttributeList::parse`] deserializes this type from JSON and then converts it
/// with [`HumanReadableAttributeList::into_normal`].
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct HumanReadableAttributeList {
    /// Affix rules in their human-readable form, keyed by affix flag.
    affixes: HashMap<char, HumanReadableExpansion>,
    /// Properties keyed by property flag; carried over unchanged by `into_normal`.
    properties: HashMap<char, Property>,
}

impl HumanReadableAttributeList {
    pub fn into_normal(self) -> Result<AttributeList, Error> {
        let mut affixes = HashMap::with_capacity(self.affixes.len());

        for (affix, expansion) in self.affixes.into_iter() {
            affixes.insert(affix, expansion.into_normal()?);
        }

        Ok(AttributeList {
            affixes,
            properties: self.properties,
        })
    }
}

#[cfg(test)]
mod tests {
    use crate::spell::{Dictionary, FstDictionary};

    /// The plural of a proper noun, if present in the curated dictionary, must itself
    /// be marked as a proper noun.
    #[test]
    fn proper_noun_property_propagates_to_plurals() {
        let dictionary = FstDictionary::curated();
        let Some(plural_metadata) = dictionary.get_word_metadata_str("Volkswagens") else {
            // Nothing to check when the word is absent from the dictionary.
            return;
        };
        assert!(plural_metadata.is_proper_noun());
    }

    /// The possessive of a proper noun, if present in the curated dictionary, must be
    /// marked as a possessive noun.
    #[test]
    fn proper_noun_propagates_to_possessives_2327() {
        let dictionary = FstDictionary::curated();
        let Some(possessive_metadata) = dictionary.get_word_metadata_str("Volkswagen's") else {
            // Nothing to check when the word is absent from the dictionary.
            return;
        };
        assert!(possessive_metadata.is_possessive_noun());
    }
}