canonical/config.rs
1use serde::{Deserialize, Serialize};
2
/// Configuration for the canonical text pipeline.
///
/// `version` is a monotonically increasing schema version for the
/// canonical layer. Any behavior change that can affect canonical text,
/// tokenization, or canonical hashes must be accompanied by a new
/// configuration version.
///
/// The remaining fields are independent boolean switches, each enabling
/// one text transform. The type derives `Serialize`/`Deserialize`, so it
/// can be stored and reloaded, and `PartialEq`/`Eq`, so two
/// configurations can be compared field by field.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
pub struct CanonicalizeConfig {
    /// Schema version of the canonicalization configuration.
    ///
    /// This is a plain integer counter (see the type-level docs), not a
    /// `major.minor.patch` semantic version.
    pub version: u32,
    /// If true, apply Unicode NFKC normalization before other transforms.
    pub normalize_unicode: bool,
    /// If true, strip punctuation characters before tokenizing.
    pub strip_punctuation: bool,
    /// If true, lowercase the text.
    pub lowercase: bool,
}
20
21impl Default for CanonicalizeConfig {
22 fn default() -> Self {
23 Self {
24 version: 1,
25 normalize_unicode: true,
26 strip_punctuation: false,
27 lowercase: true,
28 }
29 }
30}