Skip to main content

bistun_core/
traits.rs

1// Bistun Linguistic Metadata Service (LMS)
2// Copyright (C) 2026  Francis Xavier Wazeter IV
3//
4// This program is free software: you can redistribute it and/or modify
5// it under the terms of the GNU General Public License as published by
6// the Free Software Foundation, either version 3 of the License, or
7// (at your option) any later version.
8//
9// This program is distributed in the hope that it will be useful,
10// but WITHOUT ANY WARRANTY; without even the implied warranty of
11// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12// GNU General Public License for more details.
13//
14// You should have received a copy of the GNU General Public License
15// along with this program. If not, see <https://www.gnu.org/licenses/>.
16
17//! # Traits Dictionary & Enumerations
18//! Crate: bistun-core
19//! Ref: [011-LMS-DTO]
20//! Location: `crates/bistun-core/src/traits.rs`
21//!
22//! **Why**: This module defines the shared vocabulary (the `TraitKey` enum and its companion value enums) used to encapsulate the Typological and Orthographic properties of a locale.
23//! **Impact**: If this module is compromised, the `CapabilityManifest` cannot be constructed, breaking the capability engine and causing downstream services to fail.
24//!
25//! ### Glossary
26//! * **Typology**: The structural properties of a language (e.g., morphology).
27//! * **Orthography**: The mechanical rendering requirements of a script (e.g., directionality).
28
29use serde::{Deserialize, Serialize};
30
31/// The "Golden Set" of trait keys used in the `CapabilityManifest`.
32///
33/// Time: O(1) | Space: O(1)
34///
35/// # Logic Trace (Internal)
36/// 1. Represents standard keys for the DTO `traits` map; the derived `Eq` + `Hash` let a variant be used as a hash-map key.
37/// 2. `#[serde(rename_all = "SCREAMING_SNAKE_CASE")]` gives each variant its wire name, e.g. `PrimaryDirection` <-> `"PRIMARY_DIRECTION"`.
38///
39/// # Examples
40/// ```rust
41/// use crate::bistun_core::traits::TraitKey;
42/// let key = TraitKey::SegmentationStrategy;
43/// assert_eq!(key, TraitKey::SegmentationStrategy);
44/// ```
45#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
46#[serde(rename_all = "SCREAMING_SNAKE_CASE")]
47pub enum TraitKey {
48    // --- Rendering & Orthography ---
49    /// The primary layout direction (e.g., LTR, RTL). Wire name: `PRIMARY_DIRECTION`.
50    PrimaryDirection,
51    /// Indicates if the text naturally contains bidirectional elements. Wire name: `HAS_BIDI_ELEMENTS`.
52    HasBidiElements,
53    /// Indicates if the script requires complex shaping (e.g., Arabic). Wire name: `REQUIRES_SHAPING`.
54    RequiresShaping,
55    /// Unicode blocks to preload for rendering. Wire name: `UNICODE_PRELOAD_BLOCKS`.
56    UnicodePreloadBlocks,
57
58    // --- Segmentation & Morphology ---
59    /// Strategy used for word and sentence boundary detection. Wire name: `SEGMENTATION_STRATEGY`.
60    SegmentationStrategy,
61    /// Typological classification of word formation. Wire name: `MORPHOLOGY_TYPE`.
62    MorphologyType,
63    /// Plural category logic required for the locale. Wire name: `PLURAL_CATEGORIES`.
64    PluralCategories,
65
66    // --- Cultural Defaults ---
67    /// Default numeric system (e.g., latn, arab). Wire name: `DEFAULT_NUMBERING_SYSTEM`.
68    DefaultNumberingSystem,
69    /// Default calendar system (e.g., gregory, islamic). Wire name: `DEFAULT_CALENDAR`.
70    DefaultCalendar,
71}
72
73/// The UI rendering direction derived from Orthographic mechanics.
74///
75/// Time: O(1) | Space: O(1)
76///
77/// # Logic Trace (Internal)
78/// 1. Represents the text layout requirements for a specific script.
79/// 2. No serde rename attribute is applied: each variant serializes under its own identifier, which is already uppercase (e.g. `"RTL"`).
80///
81/// # Examples
82/// ```rust
83/// use crate::bistun_core::traits::Direction;
84/// let dir = Direction::RTL;
85/// assert_eq!(dir, Direction::RTL);
86/// ```
87#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
88pub enum Direction {
89    /// Left-to-Right layout; serialized as `"LTR"`.
90    LTR,
91    /// Right-to-Left layout; serialized as `"RTL"`.
92    RTL,
93    /// Top-to-Bottom layout; serialized as `"TTB"`.
94    TTB,
95    /// Native bidirectional layout; serialized as `"BIDI"`.
96    BIDI,
97}
98
99/// The boundary detection logic (Segmentation) required by the script.
100///
101/// Variants are declared from lowest to highest complexity to support the High-Water Mark strategy.
102///
103/// Time: O(1) | Space: O(1)
104///
105/// # Logic Trace (Internal)
106/// 1. The derived `PartialOrd`/`Ord` rank variants by declaration order, so reordering or inserting a variant changes every comparison.
107/// 2. Lets a consumer such as `TraitAggregator` (defined outside this module) resolve conflicts by plain comparison.
108///
109/// # Examples
110/// ```rust
111/// use crate::bistun_core::traits::SegType;
112/// // Demonstrating High-Water Mark ordinal comparison
113/// assert!(SegType::DICTIONARY > SegType::SPACE);
114/// ```
115#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Serialize, Deserialize)]
116pub enum SegType {
117    /// No segmentation required. Lowest rank.
118    NONE,
119    /// Segmentation based on whitespace. Ranks above `NONE`.
120    SPACE,
121    /// Segmentation based on individual characters/syllables. Ranks above `SPACE`.
122    CHARACTER,
123    /// Dictionary-based complex segmentation. Highest rank.
124    DICTIONARY,
125}
126
127/// The Typological structure of a language's word formation.
128///
129/// Time: O(1) | Space: O(1)
130///
131/// # Logic Trace (Internal)
132/// 1. Pure classification value, serialized under the variant identifier (no rename attribute); presumably used elsewhere to pick NLP strategies such as stemming — confirm against callers.
133///
134/// # Examples
135/// ```rust
136/// use crate::bistun_core::traits::MorphType;
137/// let morph = MorphType::AGGLUTINATIVE;
138/// assert_eq!(morph, MorphType::AGGLUTINATIVE);
139/// ```
140#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
141pub enum MorphType {
142    /// Words are invariant, with little or no inflection (e.g., Chinese).
143    ISOLATING,
144    /// Words are formed by stringing together discrete morphemes (e.g., Turkish).
145    AGGLUTINATIVE,
146    /// Morphemes are fused together so one affix carries several meanings (e.g., Spanish).
147    FUSIONAL,
148    /// Words are formed using root consonants and vowel templates (e.g., Arabic).
149    TEMPLATIC,
150    /// Complex multi-morpheme words acting as entire sentences (e.g., Inuktitut).
151    POLYSYNTHETIC,
152}
153
154// =====================================================================
155// V2.0.0 Rule Engine Directives
156// =====================================================================
157
158/// Represents the standard algorithmic directives for the Rule Synthesis Engine.
159/// Note: because of `#[serde(untagged)]` the wrapper variant name never appears in the JSON, only the inner rule value (e.g. `"NFC"`); on input serde tries the variants in declaration order, which is unambiguous here because the four inner enums share no variant names.
160/// Ref: [013-LMS-RULE]
161#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
162#[serde(untagged)]
163pub enum LmsRule {
164    /// Transliteration rule directive (wraps a `TransRule`).
165    Trans(TransRule),
166    /// Pluralization rule directive (wraps a `PluralRule`).
167    Plural(PluralRule),
168    /// Casing rule directive (wraps a `CasingRule`).
169    Casing(CasingRule),
170    /// Normalization rule directive (wraps a `NormRule`).
171    Norm(NormRule),
172}
173
174/// Directives for transliteration and phonetic rendering strategies; each variant serializes under its identifier (no rename attribute).
175#[allow(non_camel_case_types)] // `ICU_TRANSFORM` keeps its underscore because the identifier is also the serialized name.
176#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
177pub enum TransRule {
178    /// No transliteration required.
179    NONE,
180    /// Standard romanization transformation.
181    ROMANIZATION,
182    /// Phonetic spelling transformation.
183    PHONETIC,
184    /// ICU4X algorithmic transform capability.
185    ICU_TRANSFORM,
186}
187
188/// Directives for Unicode normalization logic; each variant serializes under its identifier (no rename attribute).
189#[allow(non_camel_case_types)] // NOTE(review): no variant here contains an underscore, so this allow looks redundant — confirm before removing.
190#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
191pub enum NormRule {
192    /// Normalization Form C (canonical composition).
193    NFC,
194    /// Normalization Form D (canonical decomposition).
195    NFD,
196    /// Normalization Form KC (compatibility composition).
197    NFKC,
198    /// Normalization Form KD (compatibility decomposition).
199    NFKD,
200}
201
202/// Directives for morphological plural category mapping; each variant serializes under its identifier (no rename attribute).
203#[allow(non_camel_case_types)] // Underscored identifiers are kept because they are also the serialized names.
204#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
205pub enum PluralRule {
206    /// Only cardinal numbers are supported.
207    CARDINAL_ONLY,
208    /// Ordinal and cardinal numbers are supported.
209    ORDINAL_SUPPORT,
210    /// Multiple plural categories required (few, many, other, etc.).
211    MULTIPLE_CATEGORIES,
212}
213
214/// Directives for typographic casing mechanics; each variant serializes under its identifier (no rename attribute).
215#[allow(non_camel_case_types)] // Underscored identifiers are kept because they are also the serialized names.
216#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
217pub enum CasingRule {
218    /// Strict case sensitivity.
219    CASE_SENSITIVE,
220    /// Case-insensitive matching.
221    CASE_INSENSITIVE,
222    /// Special Unicode casing rules (e.g., Turkish dotless i).
223    UNICODE_SPECIAL,
224}
225
226#[cfg(test)]
227mod tests { // Unit tests for the derived ordering of `SegType` and the serde naming of `TraitKey`.
228    use super::*;
229
230    #[test]
231    fn test_seg_type_high_water_mark_ordering() {
232        // [Logic Trace Mapping]
233        // [STEP 1]: Setup: None needed; `SegType` variants are unit values compared directly.
234        // [STEP 2]: Execute: Compare pairs with `>`, which uses the derived `PartialOrd`/`Ord` (declaration order).
235        // [STEP 3]: Assert: DICTIONARY > SPACE, CHARACTER > SPACE, SPACE > NONE. NOTE(review): DICTIONARY vs CHARACTER is not asserted directly.
236        assert!(SegType::DICTIONARY > SegType::SPACE);
237        assert!(SegType::CHARACTER > SegType::SPACE);
238        assert!(SegType::SPACE > SegType::NONE);
239    }
240
241    #[test]
242    fn test_trait_key_serialization() {
243        // [Logic Trace Mapping]
244        // [STEP 1]: Setup: Pick two multi-word `TraitKey` variants.
245        // [STEP 2]: Execute: Serialize each to a JSON string with `serde_json::to_string`.
246        // [STEP 3]: Assert: Output is the quoted SCREAMING_SNAKE_CASE name produced by `rename_all`.
247        let key_dir = TraitKey::PrimaryDirection;
248        let key_num = TraitKey::DefaultNumberingSystem;
249
250        let json_dir = serde_json::to_string(&key_dir).expect("Failed to serialize trait key");
251        let json_num = serde_json::to_string(&key_num).expect("Failed to serialize trait key");
252
253        assert_eq!(json_dir, r#""PRIMARY_DIRECTION""#);
254        assert_eq!(json_num, r#""DEFAULT_NUMBERING_SYSTEM""#);
255    }
256}
256}