1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
//! Pre-compiled and source-text constants for text-transformation pipelines.
//!
//! All items are conditional on feature flags:
//!
//! - `runtime_build` — exposes raw text-map string constants (`FANJIAN`, `TEXT_DELETE`, etc.)
//! that are parsed on first use to build transformation tables dynamically.
//! - default (`not(runtime_build)`) — exposes pre-compiled binary constants (`*_L1_BYTES`,
//! `*_L2_BYTES`, `*_BYTES`, `*_STR`) embedded at build time by `build.rs` and decoded
//! lazily when the corresponding matcher is first requested.
// ── runtime_build: source text maps ─────────────────────────────────────────
/// Raw traditional→simplified mapping table, embedded as text.
///
/// Format: one tab-separated `(traditional, simplified)` codepoint pair per line.
///
/// Used by the step registry under `runtime_build` to
/// build the Fanjian 2-stage page table at startup.
//
// NOTE(review): the module-level docs say the text maps are "parsed on first use",
// while this comment says "at startup" — confirm which is accurate.
// NOTE(review): the path argument of `include_str!` is not visible in this view —
// confirm which file is embedded.
pub const FANJIAN: &str = include_str!;
/// Raw list of characters that the Delete step removes, embedded as text.
///
/// Format: newline-separated characters.
///
/// Used under `runtime_build` to populate the Delete BitSet.
//
// NOTE(review): the path argument of `include_str!` is not visible in this view —
// confirm which file is embedded.
pub const TEXT_DELETE: &str = include_str!;
/// Raw digit/number normalization table, embedded as text.
///
/// Format: tab-separated `(source, normalized)` pairs.
///
/// Merged with [`NORM`] to build the Normalize Aho-Corasick automaton under `runtime_build`.
//
// NOTE(review): the path argument of `include_str!` is not visible in this view —
// confirm which file is embedded.
pub const NUM_NORM: &str = include_str!;
/// Raw general Unicode normalization table, embedded as text.
///
/// Format: tab-separated `(source, normalized)` pairs covering mappings such as
/// full-width→half-width and variant forms.
///
/// Merged with [`NUM_NORM`] to build the Normalize automaton under `runtime_build`.
//
// NOTE(review): the path argument of `include_str!` is not visible in this view —
// confirm which file is embedded.
pub const NORM: &str = include_str!;
/// Raw character→Pinyin mapping table for CJK codepoints, embedded as text.
///
/// Format: tab-separated `(character, pinyin_with_spaces)` pairs.
///
/// Used under `runtime_build` to build the Pinyin 2-stage page table and string buffer.
//
// NOTE(review): the path argument of `include_str!` is not visible in this view —
// confirm which file is embedded.
pub const PINYIN: &str = include_str!;
/// All Unicode codepoints that the Delete step treats as whitespace.
///
/// Includes standard ASCII control characters plus selected Unicode space variants
/// (selected codepoints from U+2000–U+200F such as U+200D/U+200F, line/paragraph separators,
/// ideographic space, etc.).
///
/// Loaded at runtime under `runtime_build` to populate the Delete BitSet alongside
/// [`TEXT_DELETE`].
//
// NOTE(review): neither the element type nor the literal contents of this constant are
// visible in this view — verify the list above against the actual entries.
pub const WHITE_SPACE: & = &;
// ── default build: pre-compiled normalization automaton ──────────────────────
/// Source patterns for the Normalize matcher, one per line (newline-separated).
///
/// Pattern `i` in this list is replaced by entry `i` of
/// [`NORMALIZE_PROCESS_REPLACE_LIST_STR`].
///
/// Loaded via `include_str!` from the `OUT_DIR` artifact produced by `build.rs`.
/// Used when the `dfa` feature is enabled and `runtime_build` is disabled.
//
// NOTE(review): the artifact path passed to `include_str!` is not visible in this view.
pub const NORMALIZE_PROCESS_LIST_STR: &str =
include_str!;
/// Pre-serialized `daachorse` matcher for the Normalize step.
///
/// This is the alternative to the plain-text pattern list
/// ([`NORMALIZE_PROCESS_LIST_STR`]) for builds without the `dfa` feature.
///
/// Loaded via `include_bytes!` from the `OUT_DIR` artifact produced by `build.rs`.
/// Only used when `dfa` is disabled and `runtime_build` is disabled.
//
// NOTE(review): the declared type and the artifact path are not visible in this view;
// presumably a byte slice — confirm.
pub const NORMALIZE_PROCESS_MATCHER_BYTES: & = include_bytes!;
/// Replacement strings for the Normalize step, one per line (newline-separated).
///
/// The list is parallel to the Normalize pattern list: index `i` is the replacement for
/// pattern `i` in `NORMALIZE_PROCESS_LIST_STR` (DFA) or for pattern `i` in the pattern
/// order of `NORMALIZE_PROCESS_MATCHER_BYTES` (DAAC). Loaded from `OUT_DIR`.
//
// NOTE(review): the artifact path passed to `include_str!` is not visible in this view.
pub const NORMALIZE_PROCESS_REPLACE_LIST_STR: &str = include_str!;
// ── default build: Fanjian page tables ──────────────────────────────────────
/// L1 index for the Fanjian 2-stage page table (`u16[4352]`, little-endian).
///
/// 4352 × 256 = 0x110000, so the index presumably holds one entry per 256-codepoint
/// page across the whole Unicode range — confirm against the matcher.
///
/// See [`crate::process::transform::charwise::FanjianMatcher`]
/// for the table layout.
//
// NOTE(review): the declared type and the `include_bytes!` path are not visible in this
// view; presumably a byte slice loaded from a `build.rs` artifact — confirm.
pub const FANJIAN_L1_BYTES: & =
include_bytes!;
/// L2 data for the Fanjian 2-stage page table (`u32[num_pages * 256]`, little-endian).
///
/// Each page spans 256 `u32` entries; the number of pages is determined by the
/// generated data rather than fixed here.
//
// NOTE(review): the meaning of each `u32` entry is not documented in this file, and the
// declared type and `include_bytes!` path are not visible in this view — confirm.
pub const FANJIAN_L2_BYTES: & =
include_bytes!;
// ── default build: Pinyin page tables ───────────────────────────────────────
/// L1 index for the Pinyin 2-stage page table (`u16[4352]`, little-endian).
///
/// 4352 × 256 = 0x110000, so the index presumably holds one entry per 256-codepoint
/// page across the whole Unicode range — confirm against the matcher.
//
// NOTE(review): the declared type and the `include_bytes!` path are not visible in this
// view; presumably a byte slice loaded from a `build.rs` artifact — confirm.
pub const PINYIN_L1_BYTES: & =
include_bytes!;
/// L2 data for the Pinyin 2-stage page table (`u32[num_pages * 256]`, little-endian).
///
/// Each entry packs `(offset << 8) | length` into a `u32`, pointing into [`PINYIN_STR_BYTES`].
/// That is, the low 8 bits hold the length and the upper 24 bits hold the offset.
//
// NOTE(review): offset/length are presumably measured in bytes of the string buffer —
// confirm. The declared type and `include_bytes!` path are not visible in this view.
pub const PINYIN_L2_BYTES: & =
include_bytes!;
/// Concatenated Pinyin syllable strings referenced by [`PINYIN_L2_BYTES`].
///
/// Despite the `_BYTES` suffix, this constant is a `&str` embedded with `include_str!`.
///
/// Individual mappings may include surrounding spaces; `PinYinChar` trims those boundaries
/// after lookup.
//
// NOTE(review): the path argument of `include_str!` is not visible in this view —
// confirm which artifact is embedded.
pub const PINYIN_STR_BYTES: &str = include_str!;
// ── default build: Delete BitSet ─────────────────────────────────────────────
/// Flat 139 KB bitset (`u8[139264]`) covering all Unicode codepoints 0x0–0x10FFFF.
///
/// 139264 bytes × 8 bits = 0x110000 bits, one per codepoint ("139 KB" is decimal; the
/// table is 136 KiB).
///
/// Bit `cp % 8` of byte `cp / 8` is set when codepoint `cp` should be removed by the
/// Delete step. Generated at build time from `TEXT-DELETE.txt` and `WHITE_SPACE`.
//
// NOTE(review): the declared type and the `include_bytes!` path are not visible in this
// view; presumably a byte slice loaded from a `build.rs` artifact — confirm.
pub const DELETE_BITSET_BYTES: & =
include_bytes!;