matcher_rs 0.12.1

A high-performance matcher designed to solve LOGICAL and TEXT VARIATIONS problems in word matching, implemented in Rust.
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
//! Transformation graph construction and traversal.
//!
//! The "graph" is a flat-array trie where each node represents one single-bit
//! [`ProcessType`] step. [`build_process_type_tree`] constructs the trie from a set of
//! composite bitmasks, merging shared prefixes so that intermediate results are computed
//! once. [`walk_process_tree`] then traverses the array in parent-before-child order,
//! applying each transformation step exactly once per reachable prefix and collecting
//! the resulting [`TextVariant`]s.
//!
//! # Example trie
//!
//! Given the set `{Fanjian, Fanjian|Delete, Fanjian|Delete|Normalize}`, the trie is:
//!
//! ```text
//!   [0] root (None)
//!    └─[1] Fanjian          ← terminates: {Fanjian}
//!       └─[2] Delete        ← terminates: {Fanjian|Delete}
//!          └─[3] Normalize  ← terminates: {Fanjian|Delete|Normalize}
//! ```
//!
//! Walking this trie for input `"妳!好A"` computes Fanjian once, then Delete on the
//! Fanjian output, then Normalize on the Delete output — reusing each intermediate.

use std::borrow::Cow;
use std::collections::HashSet;

use tinyvec::TinyVec;

use crate::process::process_type::ProcessType;
use crate::process::registry::get_transform_step;
use crate::process::step::TransformStep;
use crate::process::variant::{
    ProcessedTextMasks, TRANSFORM_STATE, TextVariant, return_string_to_pool,
};

/// A node in the flat-array transformation trie used by [`walk_process_tree`].
///
/// The trie is built once by [`build_process_type_tree`] and stored inside
/// [`SimpleMatcher`](crate::SimpleMatcher). Each node represents a single transformation
/// step (`process_type_bit`) reachable from its parent. Nodes form a tree:
///
/// - **Root** (index 0) always has `process_type_bit = ProcessType::None` and no `step`.
/// - **Inner / leaf nodes** each carry a cached `&'static TransformStep` so the hot
///   traversal loop avoids a registry lookup.
/// - **`process_type_list`** records which composite [`ProcessType`] values terminate at
///   this node, so the traversal can tag output text variants with the correct bitmask.
/// - **`pt_index_mask`** is the pre-computed OR of `1u64 << pt.bits()` for all entries in
///   `process_type_list`, avoiding a per-call fold in the hot loop.
///
/// All fields are `pub(crate)` or private; users obtain instances exclusively through
/// [`build_process_type_tree`].
///
/// # Examples
///
/// ```rust
/// use std::collections::HashSet;
/// use matcher_rs::{ProcessType, build_process_type_tree};
///
/// let types = HashSet::from([ProcessType::None, ProcessType::Fanjian]);
/// let tree = build_process_type_tree(&types);
///
/// // Root node is always at index 0; at least one child for Fanjian.
/// assert!(tree.len() >= 2);
/// ```
#[derive(Clone)]
pub struct ProcessTypeBitNode {
    /// Composite [`ProcessType`] values whose decomposed bit-path terminates at this node.
    ///
    /// A non-empty list means that one or more rules emit a text variant here. For example,
    /// if both `Fanjian` and `Fanjian|Delete` are in the set, the Fanjian node's list
    /// contains `Fanjian`, and the Delete child's list contains `Fanjian|Delete`.
    process_type_list: TinyVec<[ProcessType; 4]>,
    /// The single-bit [`ProcessType`] step this node represents (the edge label from parent).
    ///
    /// For the root node this is [`ProcessType::None`].
    pub(crate) process_type_bit: ProcessType,
    /// Flat-array indices of child nodes (the next transformation steps reachable from here).
    pub(crate) children: TinyVec<[usize; 4]>,
    /// Cached reference to the compiled transform step for this node's `process_type_bit`.
    ///
    /// [`None`] only for the root node. All other nodes cache their step at construction
    /// time to avoid a registry lookup in the hot [`walk_process_tree`] loop.
    pub(crate) step: Option<&'static TransformStep>,
    /// Pre-computed OR of `1u64 << pt.bits()` for every `pt` in `process_type_list`.
    ///
    /// Avoids a per-traversal fold in the hot [`walk_process_tree`] loop. After
    /// [`recompute_mask_with_index`](Self::recompute_mask_with_index) is called, the
    /// encoding switches from raw bit positions to sequential indices.
    pub(crate) pt_index_mask: u64,
}

/// Post-construction helpers for [`ProcessTypeBitNode`].
impl ProcessTypeBitNode {
    /// Re-encodes [`pt_index_mask`](Self::pt_index_mask) using a sequential index table.
    ///
    /// The default encoding stores `1u64 << pt.bits()`, which can scatter bits across the
    /// full `u64` range for composite [`ProcessType`] values. A sequential index keeps bit
    /// positions small (`0..N` where `N` is the number of unique composite types) so that
    /// downstream data structures (e.g. `PatternEntry`) can store the index as a `u8`
    /// rather than a `u64`, halving entry size.
    ///
    /// `pt_index_table[pt.bits()]` must contain the sequential index for every composite
    /// [`ProcessType`] that terminates at any node (i.e. every type in the original
    /// `process_type_set` plus [`ProcessType::None`]).
    ///
    /// Called by [`SimpleMatcher::new()`](crate::SimpleMatcher) after
    /// [`build_process_type_tree`] returns.
    pub(crate) fn recompute_mask_with_index(&mut self, pt_index_table: &[u8; 64]) {
        self.pt_index_mask = self.process_type_list.iter().fold(0u64, |acc, pt| {
            acc | (1u64 << pt_index_table[pt.bits() as usize])
        });
    }
}

/// Builds a flat-array trie from a set of composite [`ProcessType`] bitmasks.
///
/// The trie encodes every unique prefix path among the given composite types. A root node
/// with `process_type_bit = ProcessType::None` is always present at index 0. For each
/// composite type (e.g. `Fanjian | Delete`), the constructor walks its constituent bits in
/// ascending order, reusing existing child nodes where the path overlaps with previously
/// inserted types and creating new child nodes when a path diverges.
///
/// The resulting flat `Vec<ProcessTypeBitNode>` is passed to [`walk_process_tree`], which
/// scans the node array in parent-before-child order to compute all required text variants
/// while reusing common prefixes.
///
/// # Examples
///
/// ```rust
/// use std::collections::HashSet;
/// use matcher_rs::{ProcessType, build_process_type_tree};
///
/// let types = HashSet::from([
///     ProcessType::None,
///     ProcessType::Fanjian,
///     ProcessType::Fanjian | ProcessType::Delete,
/// ]);
/// let tree = build_process_type_tree(&types);
///
/// // Root (None) + Fanjian node + Delete node = at least 3 nodes.
/// assert!(tree.len() >= 3);
/// ```
pub fn build_process_type_tree(process_type_set: &HashSet<ProcessType>) -> Vec<ProcessTypeBitNode> {
    let max_nodes: usize = 1 + process_type_set
        .iter()
        .map(|pt| pt.bits().count_ones() as usize)
        .sum::<usize>();
    let mut process_type_tree = Vec::with_capacity(max_nodes);
    let mut root = ProcessTypeBitNode {
        process_type_list: TinyVec::new(),
        process_type_bit: ProcessType::None,
        children: TinyVec::new(),
        step: None,
        pt_index_mask: 0,
    };
    if process_type_set.contains(&ProcessType::None) {
        root.process_type_list.push(ProcessType::None);
        root.pt_index_mask |= 1u64 << ProcessType::None.bits();
    }
    process_type_tree.push(root);
    for &process_type in process_type_set.iter() {
        let mut current_node_index = 0;
        for process_type_bit in process_type.iter() {
            let current_node = &process_type_tree[current_node_index];
            if current_node.process_type_bit == process_type_bit {
                continue;
            }

            let found_child = current_node
                .children
                .iter()
                .find(|&&idx| process_type_tree[idx].process_type_bit == process_type_bit)
                .copied();

            if let Some(child_idx) = found_child {
                current_node_index = child_idx;
                process_type_tree[current_node_index]
                    .process_type_list
                    .push(process_type);
                process_type_tree[current_node_index].pt_index_mask |= 1u64 << process_type.bits();
            } else {
                let mut child = ProcessTypeBitNode {
                    process_type_list: TinyVec::new(),
                    process_type_bit,
                    children: TinyVec::new(),
                    step: Some(get_transform_step(process_type_bit)),
                    pt_index_mask: 0,
                };
                child.process_type_list.push(process_type);
                child.pt_index_mask |= 1u64 << process_type.bits();
                process_type_tree.push(child);
                let new_node_index = process_type_tree.len() - 1;
                process_type_tree[current_node_index]
                    .children
                    .push(new_node_index);
                current_node_index = new_node_index;
            }
        }
    }
    process_type_tree
}

/// Inserts a transformed text into `text_masks`, deduplicating against existing entries.
///
/// If `changed` is `Some(text)` and an equal string already exists in `text_masks`, the
/// duplicate is returned to the thread-local pool and the existing index is returned.
/// Otherwise the new text is appended and the new index is returned. If `changed` is
/// [`None`] (the step was a no-op), `current_index` is returned unchanged.
///
/// `is_ascii` indicates whether the transformed text consists entirely of ASCII bytes;
/// it is stored alongside the text so callers can skip the charwise automaton without a
/// redundant byte scan.
///
/// This deduplication keeps [`walk_process_tree`] in "unique string" space even when
/// different trie paths converge onto the same transformed text (e.g. when Fanjian and
/// Normalize both produce the same output for a particular input).
fn dedup_insert(
    text_masks: &mut ProcessedTextMasks<'_>,
    current_index: usize,
    changed: Option<String>,
    is_ascii: bool,
) -> usize {
    match changed {
        Some(processed) => {
            let plen = processed.len();
            if let Some(pos) = text_masks
                .iter()
                .position(|tv| tv.text.len() == plen && tv.text.as_ref() == processed.as_str())
            {
                return_string_to_pool(processed);
                pos
            } else {
                text_masks.push(TextVariant {
                    text: Cow::Owned(processed),
                    mask: 0u64,
                    is_ascii,
                });
                text_masks.len() - 1
            }
        }
        None => current_index,
    }
}

/// Walks the transformation trie, producing all text variants needed for matching.
///
/// This is the hot-path function called on every [`SimpleMatcher::is_match`](crate::SimpleMatcher::is_match) /
/// [`SimpleMatcher::process`](crate::SimpleMatcher::process) invocation. It performs one
/// forward pass over the flat tree built by [`build_process_type_tree`], relying on the
/// invariant that every parent node appears before its children (guaranteed by construction).
/// Common prefixes (for example the shared `Fanjian` step in both `Fanjian | Delete` and
/// `Fanjian | Normalize`) are computed once and their result indices are reused for child
/// nodes.
///
/// Each [`TextVariant`] in the returned [`ProcessedTextMasks`] carries the transformed
/// text, the bitmask of [`ProcessType`] indices that produced it, and an `is_ascii` flag.
/// When different trie paths converge to the same string, they share a single entry with
/// a merged mask.
///
/// # Const-generic `LAZY` parameter
///
/// - **`LAZY = false`** — Produces all variants without callbacks; `on_variant` is never
///   called. Use this when you need the complete set of variants (e.g. for
///   [`SimpleMatcher::process`](crate::SimpleMatcher::process)).
///
/// - **`LAZY = true`** — Calls `on_variant(text, index, mask, is_ascii)` as soon as each
///   new unique variant is produced. If `on_variant` returns `true`, the walk stops early.
///   A "delta phase" at the end replays any additional mask bits that were merged into an
///   already-seen text through deduplication. Use this for
///   [`SimpleMatcher::is_match`](crate::SimpleMatcher::is_match) to short-circuit as soon
///   as a rule is satisfied.
///
/// # Return value
///
/// Returns `(text_masks, stopped)` where `stopped` is `true` only when `LAZY = true` and
/// `on_variant` triggered early exit.
///
/// Inside `matcher_rs`, owned strings in the returned vector are recycled via
/// `return_processed_string_to_pool`.
/// External callers can simply drop the vector — no manual cleanup is needed.
///
/// # Safety
///
/// Accesses `TRANSFORM_STATE` through `UnsafeCell::get()`.
/// Safe because `#[thread_local]` guarantees single-threaded access and this function is
/// never called re-entrantly.
///
/// Contains a `transmute` from `ProcessedTextMasks<'static>` to `ProcessedTextMasks<'a>`
/// when recycling a pooled buffer. This is sound because the pooled buffer is always empty
/// (drained before being returned to the pool), so no `Cow` borrows with the wrong lifetime
/// exist, and `Vec` layout is lifetime-independent.
///
/// # Panics
///
/// Panics (via `.expect()`) if a non-root node has `step = None`, which indicates a
/// construction bug in [`build_process_type_tree`].
///
/// # Examples
///
/// ```rust
/// use std::collections::HashSet;
/// use matcher_rs::{ProcessType, build_process_type_tree, walk_process_tree};
///
/// let process_types = HashSet::from([
///     ProcessType::None,
///     ProcessType::Fanjian,
///     ProcessType::Delete,
///     ProcessType::Fanjian | ProcessType::Delete,
/// ]);
/// let tree = build_process_type_tree(&process_types);
///
/// // LAZY=false: produce all variants without early stopping.
/// let (variants, stopped) = walk_process_tree::<false, _>(
///     &tree, "妳!好", &mut |_, _, _, _| false,
/// );
/// assert!(!stopped);
///
/// let texts: std::collections::HashSet<_> = variants
///     .into_iter()
///     .map(|tv| tv.text.into_owned())
///     .collect();
///
/// assert_eq!(texts.len(), 4);
/// assert!(texts.contains("妳!好"));  // raw (None)
/// assert!(texts.contains("你!好"));  // Fanjian
/// assert!(texts.contains("妳好"));    // Delete
/// assert!(texts.contains("你好"));    // Fanjian | Delete
/// ```
#[inline(always)]
pub fn walk_process_tree<'a, const LAZY: bool, F>(
    process_type_tree: &[ProcessTypeBitNode],
    text: &'a str,
    on_variant: &mut F,
) -> (ProcessedTextMasks<'a>, bool)
where
    F: FnMut(&str, usize, u64, bool) -> bool,
{
    {
        // SAFETY: #[thread_local] guarantees single-threaded access.
        // walk_process_tree is never called re-entrantly.
        let ts = unsafe { &mut *TRANSFORM_STATE.get() };

        let pooled: Option<ProcessedTextMasks<'static>> = ts.masks_pool.pop();
        // Safety: pool holds empty Vecs with no live borrows; transmuting from
        // 'static to 'a is safe since 'static: 'a (covariant) and Vec is empty.
        let mut text_masks: ProcessedTextMasks<'a> =
            unsafe { std::mem::transmute(pooled.unwrap_or_default()) };
        text_masks.clear();
        let root_is_ascii = text.is_ascii();
        text_masks.push(TextVariant {
            text: Cow::Borrowed(text),
            mask: process_type_tree[0].pt_index_mask,
            is_ascii: root_is_ascii,
        });

        let mut scanned_masks: TinyVec<[u64; 8]> = TinyVec::new();
        if LAZY {
            scanned_masks.push(0u64);
            let root_mask = process_type_tree[0].pt_index_mask;
            if root_mask != 0 && on_variant(text, 0, root_mask, root_is_ascii) {
                return (text_masks, true);
            }
            scanned_masks[0] = root_mask;
        }

        if process_type_tree[0].children.is_empty() {
            return (text_masks, false);
        }

        ts.tree_node_indices.clear();
        ts.tree_node_indices.resize(process_type_tree.len(), 0);

        let mut stopped = false;

        'walk: for (current_node_index, current_node) in process_type_tree.iter().enumerate() {
            let current_index = ts.tree_node_indices[current_node_index];
            let parent_is_ascii = text_masks[current_index].is_ascii;

            for &child_node_index in &current_node.children {
                let child_node = &process_type_tree[child_node_index];
                let step = child_node
                    .step
                    .expect("non-root process tree nodes always cache a transform step");
                let current_text = text_masks[current_index].text.as_ref();
                let output = step.apply(current_text, parent_is_ascii);

                let old_len = if LAZY { text_masks.len() } else { 0 };
                let child_index = dedup_insert(
                    &mut text_masks,
                    current_index,
                    output.changed,
                    output.is_ascii,
                );

                if LAZY {
                    while scanned_masks.len() < text_masks.len() {
                        scanned_masks.push(0u64);
                    }
                }

                ts.tree_node_indices[child_node_index] = child_index;
                text_masks[child_index].mask |= child_node.pt_index_mask;

                if LAZY && child_index >= old_len {
                    // New unique text: call on_variant immediately.
                    let mask = text_masks[child_index].mask;
                    let is_ascii = text_masks[child_index].is_ascii;
                    if mask != 0
                        && on_variant(
                            text_masks[child_index].text.as_ref(),
                            child_index,
                            mask,
                            is_ascii,
                        )
                    {
                        stopped = true;
                        break 'walk;
                    }
                    scanned_masks[child_index] = mask;
                }
                // Dedup'd entry: mask may have grown; handled in delta phase below.
            }
        }

        if LAZY {
            if stopped {
                return (text_masks, true);
            }
            // Delta phase: re-scan entries whose mask grew after their initial callback.
            for i in 0..text_masks.len() {
                let delta = text_masks[i].mask & !scanned_masks[i];
                if delta != 0
                    && on_variant(
                        text_masks[i].text.as_ref(),
                        i,
                        delta,
                        text_masks[i].is_ascii,
                    )
                {
                    return (text_masks, true);
                }
            }
        }

        (text_masks, false)
    }
}