Skip to main content

canonical/
config.rs

1use serde::{Deserialize, Serialize};
2
3/// Configuration for the canonical text pipeline.
4///
5/// `version` is a monotonically increasing schema version for the
6/// canonical layer. Any behavior change that can affect canonical text,
7/// tokenization, or canonical hashes must be accompanied by a new
8/// configuration version.
9#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
10pub struct CanonicalizeConfig {
11    /// Semantic version of the canonicalization configuration.
12    pub version: u32,
13    /// If true, apply Unicode NFKC normalization before other transforms.
14    pub normalize_unicode: bool,
15    /// If true, strip punctuation characters before tokenizing.
16    pub strip_punctuation: bool,
17    /// If true, lowercase the text.
18    pub lowercase: bool,
19}
20
21impl Default for CanonicalizeConfig {
22    fn default() -> Self {
23        Self {
24            version: 1,
25            normalize_unicode: true,
26            strip_punctuation: false,
27            lowercase: true,
28        }
29    }
30}