1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
//! Pre-compiled and source-text constants for text-transformation pipelines.
//!
//! All items are conditional on feature flags:
//!
//! - `runtime_build` — exposes raw text-map string constants (`FANJIAN`, `TEXT_DELETE`, etc.)
//! that are parsed at startup to build transformation tables dynamically.
//! - default (`not(runtime_build)`) — exposes pre-compiled binary constants (`*_L1_BYTES`,
//! `*_L2_BYTES`, `*_BYTES`, `*_STR`) embedded at build time by `build.rs` for
//! zero-startup-cost loading.
// ── runtime_build: source text maps ─────────────────────────────────────────
/// Tab-separated `(traditional, simplified)` codepoint pairs, one per line.
///
/// Source text for the Fanjian step, embedded into the binary at compile time via
/// `include_str!` (no file I/O happens at runtime).
///
/// Used by [`get_process_matcher`](crate::get_process_matcher) under `runtime_build` to
/// build the Fanjian 2-stage page table at startup. The default build ships the table
/// pre-compiled as `FANJIAN_L1_BYTES` / `FANJIAN_L2_BYTES` instead.
pub const FANJIAN: &str = include_str!;
/// Newline-separated characters (and ranges) that should be removed by the Delete step.
///
/// Embedded at compile time via `include_str!`. Used under `runtime_build` to populate the
/// Delete BitSet together with [`WHITE_SPACE`]; the default build ships the finished set as
/// `DELETE_BITSET_BYTES` instead.
// NOTE(review): the syntax of a "range" line is defined by the loader, not by this file —
// confirm against the parsing code before editing the data file.
pub const TEXT_DELETE: &str = include_str!;
/// Tab-separated `(source, normalized)` pairs for digit/number normalization.
///
/// Embedded at compile time via `include_str!`. Under `runtime_build` these pairs are
/// merged with [`NORM`] to build the Normalize Aho-Corasick automaton.
// NOTE(review): how a source key present in both maps is resolved depends on the merge
// code, which is not in this file — confirm before adding overlapping entries.
pub const NUM_NORM: &str = include_str!;
/// Tab-separated `(source, normalized)` pairs for general Unicode normalization
/// (full-width→half-width, variant forms, etc.).
///
/// Embedded at compile time via `include_str!`. Under `runtime_build` these pairs are
/// merged with [`NUM_NORM`] to build the Normalize automaton.
// NOTE(review): how a source key present in both maps is resolved depends on the merge
// code, which is not in this file — confirm before adding overlapping entries.
pub const NORM: &str = include_str!;
/// Tab-separated `(character, pinyin_with_spaces)` pairs covering CJK codepoints.
///
/// Embedded at compile time via `include_str!`. Used under `runtime_build` to build the
/// Pinyin 2-stage page table and string buffer; the default build ships those pre-compiled
/// as `PINYIN_L1_BYTES`, `PINYIN_L2_BYTES` and `PINYIN_STR_BYTES`.
pub const PINYIN: &str = include_str!;
/// All Unicode codepoints considered whitespace for the Delete step.
///
/// Includes standard ASCII control characters plus a wide range of Unicode space variants
/// (General Punctuation U+2000–U+200F, line/paragraph separators, ideographic space, etc.).
///
/// Loaded at runtime under `runtime_build` to populate the Delete BitSet alongside
/// [`TEXT_DELETE`]. According to the `DELETE_BITSET_BYTES` documentation, the same list also
/// feeds the BitSet that `build.rs` generates for the default build.
// NOTE(review): entries are removed by the Delete step, so adding or dropping a codepoint
// here changes matching behavior — presumably in both build modes; confirm in `build.rs`.
pub const WHITE_SPACE: & = &;
// ── default build: pre-compiled normalization automaton ──────────────────────
/// Newline-separated source patterns for the Normalize Aho-Corasick DFA.
///
/// Loaded via `include_str!` from the `OUT_DIR` artifact produced by `build.rs`; because
/// `include_str!` is used, that artifact must be valid UTF-8 text.
///
/// Pattern `i` in this list is replaced by entry `i` of
/// `NORMALIZE_PROCESS_REPLACE_LIST_STR`, so the two lists must stay the same length and in
/// the same order.
///
/// Only used when the `dfa` feature is enabled and `runtime_build` is disabled.
pub const NORMALIZE_PROCESS_LIST_STR: &str =
include_str!;
/// Pre-serialized `daachorse` double-array Aho-Corasick matcher for the Normalize step.
///
/// Loaded via `include_bytes!` from the `OUT_DIR` artifact produced by `build.rs`.
/// Replacement strings for its patterns live in `NORMALIZE_PROCESS_REPLACE_LIST_STR`, in
/// the matcher's pattern order.
///
/// Only used when `dfa` is disabled and `runtime_build` is disabled.
// NOTE(review): presumably the serialized layout is tied to the `daachorse` version used by
// `build.rs` — confirm that build-time and runtime versions cannot diverge.
pub const NORMALIZE_PROCESS_MATCHER_BYTES: & = include_bytes!;
/// Newline-separated replacement strings parallel to the Normalize pattern list.
///
/// Index `i` is the replacement for pattern `i` in `NORMALIZE_PROCESS_LIST_STR` (DFA) or
/// the pattern order in `NORMALIZE_PROCESS_MATCHER_BYTES` (DAAC). Loaded from `OUT_DIR`
/// via `include_str!`, so the artifact must be valid UTF-8 text.
// NOTE(review): newline is the separator, so a replacement presumably cannot itself contain
// a newline — confirm against the generator in `build.rs`.
pub const NORMALIZE_PROCESS_REPLACE_LIST_STR: &str = include_str!;
// ── default build: Fanjian page tables ──────────────────────────────────────
/// L1 index for the Fanjian 2-stage page table (`u16[4352]`, little-endian).
///
/// The blob is 4352 × 2 = 8704 bytes. Since 4352 × 256 = 0x110000, combining this index with
/// the 256-entry pages of [`FANJIAN_L2_BYTES`] spans every codepoint in 0x0–0x10FFFF.
///
/// See [`SingleCharMatcher::Fanjian`](crate::process::single_char_matcher::SingleCharMatcher)
/// for the full layout description.
// NOTE(review): how an L1 entry marks "no page for this range" is not stated here — see the
// linked layout description.
pub const FANJIAN_L1_BYTES: & = include_bytes!;
/// L2 data for the Fanjian 2-stage page table (`u32[num_pages * 256]`, little-endian).
///
/// Each page holds 256 `u32` entries (1024 bytes), so the blob length is
/// `num_pages * 1024` bytes. Pages are selected through [`FANJIAN_L1_BYTES`].
// NOTE(review): presumably each entry is the mapped (simplified) codepoint — confirm in the
// table reader.
pub const FANJIAN_L2_BYTES: & = include_bytes!;
// ── default build: Pinyin page tables ───────────────────────────────────────
/// L1 index for the Pinyin 2-stage page table (`u16[4352]`, little-endian).
///
/// The blob is 4352 × 2 = 8704 bytes. Since 4352 × 256 = 0x110000, combining this index with
/// the 256-entry pages of [`PINYIN_L2_BYTES`] spans every codepoint in 0x0–0x10FFFF.
pub const PINYIN_L1_BYTES: & = include_bytes!;
/// L2 data for the Pinyin 2-stage page table (`u32[num_pages * 256]`, little-endian).
///
/// Each entry packs `(offset << 8) | length` into a `u32`, pointing into [`PINYIN_STR_BYTES`].
/// Decoding is therefore `offset = entry >> 8` (upper 24 bits) and `length = entry & 0xFF`
/// (lower 8 bits), which caps a single entry's length at 255.
// NOTE(review): presumably `offset` and `length` are byte (not `char`) units — confirm in
// the reader.
pub const PINYIN_L2_BYTES: & = include_bytes!;
/// Concatenated Pinyin syllable strings referenced by [`PINYIN_L2_BYTES`].
///
/// Despite the `_BYTES` suffix this constant is a `&str` embedded with `include_str!`, so
/// the artifact must be valid UTF-8 text.
///
/// Individual syllables are separated by spaces; `PinYinChar` mode trims them after lookup.
pub const PINYIN_STR_BYTES: &str = include_str!;
// ── default build: Delete BitSet ─────────────────────────────────────────────
/// Flat 139 KB BitSet (`u8[139264]`) covering all Unicode codepoints 0x0–0x10FFFF.
///
/// 139264 = 0x110000 / 8, i.e. exactly one bit per codepoint.
///
/// Bit `cp % 8` of byte `cp / 8` is set when codepoint `cp` should be removed by the
/// Delete step. Generated at build time from `TEXT-DELETE.txt` and `WHITE_SPACE`.
// NOTE(review): presumably bit 0 is the least-significant bit, i.e. the membership test is
// `bytes[cp / 8] & (1 << (cp % 8)) != 0` — confirm against the reader and `build.rs`.
pub const DELETE_BITSET_BYTES: & =
include_bytes!;