disarm 0.10.0

Unicode canonicalization and TR39 confusable analysis: building blocks for text-security pipelines (homoglyph/bidi/zalgo handling) plus standards-based transliteration
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
//! Fast Unicode transliteration, slugification, and text normalization.
//!
//! `disarm` is a pure-Rust core (no Python, no pyo3 in the default build) for
//! Unicode text-security and canonicalization: TR39 confusable folding, bidi /
//! zero-width / zalgo neutralization, normalization, grapheme/width measurement,
//! slugification, and standards-based transliteration.
//!
//! The public Rust API lives in [`mod@crate::api`]; the error types are
//! [`Error`], [`ErrorKind`], and [`ErrorMode`]. Everything else is an internal
//! implementation detail (`pub(crate)` or `#[doc(hidden)]`) and carries no
//! stability guarantee — see `docs/RUST_API.md`.
//!
//! The Python extension (`disarm._core`) is an opt-in layer behind the
//! `extension-module` feature and is not built into the default crate.
//!
//! ```
//! use disarm::{api, DisarmStr};
//! // ASCII passes through unchanged; non-ASCII is romanized to ASCII.
//! assert_eq!(api::strip_accents("café"), "cafe");
//! assert_eq!(api::transliterate("Москва"), "Moskva");
//! // …or via the extension trait:
//! assert_eq!("Москва".transliterate(), "Moskva");
//! ```

// In the pure crates.io build (`default = []`, no `extension-module`), the
// shim-backing Layer-1 helpers — the Python-entry validators/dispatchers
// (`transliterate_context`, the `register_*` cores, `ErrorMode::parse`, the
// `Pipeline` builder, …) that the binding shims call but `crate::api` does not —
// are legitimately unused. Allow that here; genuinely-dead code is still caught
// by the `--features extension-module` clippy run, where every path is live.
#![cfg_attr(not(feature = "extension-module"), allow(dead_code))]

#[cfg(feature = "extension-module")]
use pyo3::prelude::*;

// #208: the opt-in logging facade. `#[macro_use]` so the `tl_*!` macros are
// available crate-wide; declared first so every later module can use them. With
// the `log` feature off they expand to nothing (zero cost, no `log` dependency).
#[macro_use]
mod obs;

// Shared utilities and error construction.
#[doc(hidden)]
pub mod utils;

// Pure-Rust error enum + the single PyO3 boundary conversion (#181).
pub(crate) mod error;
pub(crate) use error::ErrorRepr;
pub use error::{Error, ErrorKind};

/// Policy for input that an operation cannot translate or does not recognise.
///
/// Shared across transliterate, emoji, and other modules that need the
/// replace/ignore/preserve trichotomy.
///
/// The type is a fieldless `Copy` enum and implements `Eq` + `Hash`, so it can be
/// used directly as a `HashMap` key or `HashSet` member (e.g. per-mode caches or
/// counters).
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum ErrorMode {
    /// Substitute unknown input with a replacement string.
    Replace,
    /// Silently drop unknown input.
    Ignore,
    /// Pass unknown input through unchanged.
    Preserve,
}

impl ErrorMode {
    /// Returns the lowercase token that names this mode: `"replace"`,
    /// `"ignore"`, or `"preserve"`.
    #[must_use]
    pub fn as_str(self) -> &'static str {
        match self {
            Self::Replace => "replace",
            Self::Ignore => "ignore",
            Self::Preserve => "preserve",
        }
    }

    /// Pure-Rust parse of an error-mode token into the matching variant.
    ///
    /// The comparison is exact (case-sensitive, no trimming) against the tokens
    /// produced by [`ErrorMode::as_str`]. Any other input yields
    /// `ErrorRepr::InvalidErrorMode` carrying an owned copy of `s`.
    pub(crate) fn parse(s: &str) -> Result<Self, crate::ErrorRepr> {
        // Look the token up through `as_str` so the spellings are written once.
        let modes = [Self::Replace, Self::Ignore, Self::Preserve];
        modes
            .iter()
            .copied()
            .find(|mode| mode.as_str() == s)
            .ok_or_else(|| crate::ErrorRepr::InvalidErrorMode { got: s.to_owned() })
    }
}

impl std::fmt::Display for ErrorMode {
    /// Formats the mode as its canonical lowercase token (see `as_str`).
    ///
    /// Width, fill, alignment and precision flags are honoured, exactly as they
    /// are for a plain `&str`, so `format!("{:<10}", mode)` pads as expected.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        // `pad`, not `write_str`: `write_str` emits the token verbatim and
        // silently drops any formatting flags the caller supplied. With no
        // flags (plain `{}`), `pad` produces the same output as before.
        f.pad(self.as_str())
    }
}

impl std::str::FromStr for ErrorMode {
    type Err = crate::Error;

    /// Parse `"replace"` / `"ignore"` / `"preserve"`; the Python binding and Rust
    /// callers share this one validated path ("parse, don't validate").
    fn from_str(s: &str) -> Result<Self, Self::Err> {
        Self::parse(s).map_err(crate::Error::from)
    }
}

// Layer 2: the idiomatic, pyo3-free Rust API — the published crates.io surface
// (#38/#42). This and the error types (`Error`, `ErrorKind`, `ErrorMode`) are the
// ONLY public, semver-governed Rust API; everything below is `pub(crate)` or
// `#[doc(hidden)] pub` (benchmark-only entrypoints, outside the semver contract).
pub mod api;

/// The string extension trait, re-exported at the crate root so callers can
/// `use disarm::DisarmStr;` for method syntax over the [`api`] functions (#352).
pub use api::DisarmStr;

// Layer 1: the pure-Rust algorithm cores. `pub(crate)` — reachable by `api` and
// the PyO3 shims, but not part of the public crate surface (#42).
pub(crate) mod case_fold;
pub(crate) mod confusables;
pub(crate) mod context;
pub(crate) mod encoders;
pub(crate) mod encoding;
pub(crate) mod filename;
pub(crate) mod grapheme;
pub(crate) mod hostname;
pub(crate) mod limits;
pub(crate) mod log_injection;
pub(crate) mod normalize;
pub(crate) mod pipeline;
pub(crate) mod presets;
pub(crate) mod reverse;
pub(crate) mod scripts;
pub(crate) mod slugify;
pub(crate) mod unicode_ranges;
pub(crate) mod whitespace;
pub(crate) mod width;
pub(crate) mod zalgo;

// `#[doc(hidden)] pub` rather than `pub(crate)`: these three carry deep
// implementation entrypoints that the in-repo Criterion/iai benchmarks (separate
// crates, so they can only see `pub` items) measure directly. `#[doc(hidden)]`
// keeps them off docs.rs and out of the semver contract (cargo-semver-checks
// ignores hidden items) — they are NOT public API. See docs/RUST_API.md.
#[doc(hidden)]
pub mod emoji;
#[doc(hidden)]
pub mod transliterate;
// Generated PHF code contains unseparated integer literals and non-NFC
// Unicode confusable characters (which is the point of the confusables table).
#[allow(clippy::unreadable_literal, clippy::unicode_not_nfc)]
#[doc(hidden)]
pub mod tables;

// Layer 3b: the PyO3 binding shims (#38). Gated behind `feature = "extension-module"`
// (#42): `pyo3` is an optional dependency, so the pure crates.io core (`default = []`)
// builds without it. The shims, the `#[pymodule]`, the exception types, the
// `ErrorRepr -> PyErr` conversion, and `emit_py_warning` are all under this feature.
#[cfg(feature = "extension-module")]
#[doc(hidden)]
mod py;

/// The private compiled extension module, imported as `disarm._core` (the public
/// Python API in `python/disarm/__init__.py` wraps it). Not a public interface.
#[cfg(feature = "extension-module")]
#[pymodule]
#[pyo3(name = "_core")]
fn _core(m: &Bound<'_, PyModule>) -> PyResult<()> {
    // Registration only — no logic lives here. Every `add_function` / `add_class` /
    // `add` call below attaches one item from the `py::*` shim modules (or from this
    // file) to the module object `m`. Each `?` returns the first registration
    // failure to the caller as a `PyErr`; `Ok(())` means everything was attached.
    //
    // NOTE(review): pyo3 is understood to surface the `///` doc comment above as the
    // module's `__doc__`, so treat that text as runtime-visible — confirm before
    // rewording it.

    // Core transforms
    m.add_function(wrap_pyfunction!(py::transliterate::_transliterate, m)?)?;
    m.add_function(wrap_pyfunction!(
        py::transliterate::_transliterate_entry,
        m
    )?)?;
    m.add_function(wrap_pyfunction!(
        py::transliterate::_set_transliterate_fallback,
        m
    )?)?;
    m.add_function(wrap_pyfunction!(
        py::transliterate::_validate_transliterate_args,
        m
    )?)?;
    m.add_function(wrap_pyfunction!(
        py::transliterate::_find_untranslatable,
        m
    )?)?;
    m.add_function(wrap_pyfunction!(
        py::transliterate::_transliterate_context,
        m
    )?)?;
    m.add_function(wrap_pyfunction!(py::transliterate::_strip_accents, m)?)?;
    m.add_function(wrap_pyfunction!(py::transliterate::_is_ascii, m)?)?;
    m.add_function(wrap_pyfunction!(py::transliterate::_list_langs, m)?)?;
    m.add_function(wrap_pyfunction!(py::transliterate::_register_lang, m)?)?;
    m.add_function(wrap_pyfunction!(
        py::transliterate::_register_replacements,
        m
    )?)?;
    m.add_function(wrap_pyfunction!(py::transliterate::_remove_replacement, m)?)?;
    m.add_function(wrap_pyfunction!(py::transliterate::_clear_replacements, m)?)?;
    m.add_function(wrap_pyfunction!(py::transliterate::_seal_registrations, m)?)?;
    m.add_function(wrap_pyfunction!(
        py::transliterate::_registrations_sealed,
        m
    )?)?;
    m.add_function(wrap_pyfunction!(py::slugify::_slugify, m)?)?;
    m.add_function(wrap_pyfunction!(
        py::log_injection::_strip_log_injection,
        m
    )?)?;
    m.add_function(wrap_pyfunction!(py::normalize::_normalize, m)?)?;
    m.add_function(wrap_pyfunction!(py::normalize::_is_normalized, m)?)?;
    m.add_function(wrap_pyfunction!(
        py::confusables::_normalize_confusables,
        m
    )?)?;
    m.add_function(wrap_pyfunction!(py::confusables::_is_confusable, m)?)?;
    m.add_function(wrap_pyfunction!(py::encoders::_escape_html, m)?)?;
    m.add_function(wrap_pyfunction!(py::encoders::_percent_encode, m)?)?;
    m.add_function(wrap_pyfunction!(py::filename::_sanitize_filename, m)?)?;
    m.add_function(wrap_pyfunction!(py::case_fold::_fold_case, m)?)?;
    m.add_function(wrap_pyfunction!(py::whitespace::_collapse_whitespace, m)?)?;
    m.add_function(wrap_pyfunction!(py::scripts::_detect_scripts, m)?)?;
    m.add_function(wrap_pyfunction!(py::scripts::_is_mixed_script, m)?)?;
    m.add_function(wrap_pyfunction!(py::scripts::_inspect_auto_lang, m)?)?;

    // Batch APIs (single PyO3 boundary crossing for N strings)
    m.add_function(wrap_pyfunction!(
        py::transliterate::_transliterate_batch,
        m
    )?)?;
    m.add_function(wrap_pyfunction!(
        py::transliterate::_strip_accents_batch,
        m
    )?)?;
    m.add_function(wrap_pyfunction!(py::slugify::_slugify_batch, m)?)?;
    m.add_function(wrap_pyfunction!(py::normalize::_normalize_batch, m)?)?;

    // Stateful classes, plus the two `py::pipeline` functions registered
    // alongside the `_TextPipeline` class.
    m.add_class::<py::slugify::_Slugifier>()?;
    m.add_class::<py::slugify::_UniqueSlugifier>()?;
    m.add_class::<py::pipeline::_TextPipeline>()?;
    m.add_function(wrap_pyfunction!(py::pipeline::_get_pipeline, m)?)?;
    m.add_function(wrap_pyfunction!(py::pipeline::_list_profiles, m)?)?;

    // Precompiled pipelines
    m.add_function(wrap_pyfunction!(py::presets::_security_clean, m)?)?;
    m.add_function(wrap_pyfunction!(py::presets::_ml_normalize, m)?)?;
    m.add_function(wrap_pyfunction!(py::presets::_catalog_key, m)?)?;
    m.add_function(wrap_pyfunction!(py::presets::_display_clean, m)?)?;
    m.add_function(wrap_pyfunction!(py::presets::_search_key, m)?)?;
    m.add_function(wrap_pyfunction!(py::presets::_sort_key, m)?)?;
    m.add_function(wrap_pyfunction!(py::presets::_strip_bidi, m)?)?;
    m.add_function(wrap_pyfunction!(py::presets::_normalize_user_input, m)?)?;
    m.add_function(wrap_pyfunction!(py::presets::_strip_obfuscation, m)?)?;

    // Zalgo detection and stripping
    m.add_function(wrap_pyfunction!(py::zalgo::_is_zalgo, m)?)?;
    m.add_function(wrap_pyfunction!(py::zalgo::_strip_zalgo, m)?)?;

    // Grapheme cluster functions, followed by the `py::width` functions
    m.add_function(wrap_pyfunction!(py::grapheme::_grapheme_len, m)?)?;
    m.add_function(wrap_pyfunction!(py::grapheme::_grapheme_split, m)?)?;
    m.add_function(wrap_pyfunction!(py::grapheme::_grapheme_truncate, m)?)?;
    m.add_function(wrap_pyfunction!(py::width::_terminal_width, m)?)?;
    m.add_function(wrap_pyfunction!(py::width::_grapheme_width, m)?)?;

    // Hostname safety (one function and the `HostnameAnalysis` class)
    m.add_function(wrap_pyfunction!(py::hostname::_is_suspicious_hostname, m)?)?;
    m.add_class::<py::hostname::HostnameAnalysis>()?;

    // Encoding detection
    m.add_function(wrap_pyfunction!(py::encoding::_detect_encoding, m)?)?;
    m.add_function(wrap_pyfunction!(py::encoding::_decode_to_utf8, m)?)?;

    // Reverse transliteration
    m.add_function(wrap_pyfunction!(py::reverse::_reverse_transliterate, m)?)?;
    m.add_function(wrap_pyfunction!(py::reverse::_reverse_langs, m)?)?;

    // Emoji
    m.add_function(wrap_pyfunction!(py::emoji::_demojize, m)?)?;
    m.add_function(wrap_pyfunction!(py::emoji::_set_emoji_provider, m)?)?;

    // Custom exception hierarchy (#183): DisarmError base + categorised
    // subclasses. The Error -> PyErr conversion maps each variant to one of these.
    // The four types are declared with `pyo3::create_exception!` at the bottom of
    // this file; here their type objects are attached to the module by name.
    m.add("DisarmError", m.py().get_type::<DisarmError>())?;
    m.add(
        "InvalidArgumentError",
        m.py().get_type::<InvalidArgumentError>(),
    )?;
    m.add(
        "ResourceLimitError",
        m.py().get_type::<ResourceLimitError>(),
    )?;
    m.add("UnsupportedError", m.py().get_type::<UnsupportedError>())?;

    // Resource limits exposed so the Python wrapper reads them from this single
    // source instead of re-declaring the literal and risking silent drift (#200).
    m.add("_MAX_BATCH_SIZE", MAX_BATCH_SIZE)?;

    Ok(())
}

/// Maximum number of strings in a batch API call.
///
/// Prevents excessive memory allocation from a single Python call.
/// 100,000 strings is generous for any real workload; callers with
/// larger datasets should chunk.
///
/// In the extension build, the `_core` module initializer also exports this value
/// to Python as `_MAX_BATCH_SIZE`, so the Python wrapper reads the limit from here
/// rather than re-declaring the literal.
pub(crate) const MAX_BATCH_SIZE: usize = 100_000;

/// Number of inputs extracted from the Python list and processed per chunk in
/// the batch entry points (#239). Bounds peak Rust-side input residency to one
/// chunk rather than the whole batch, while keeping the GIL-release/compute
/// ratio favourable (the GIL is released once per chunk).
///
/// NOTE(review): the batch entry points that read this constant live outside this
/// file, so the chunking and GIL behaviour described above could not be checked
/// here — confirm against those call sites before changing the value.
pub(crate) const BATCH_CHUNK_SIZE: usize = 64;

/// Unwrap a lock result, taking the guard even when the lock is poisoned.
///
/// Accepts the `LockResult` of a `Mutex` lock or an `RwLock` read/write
/// acquisition. An `Ok` guard is returned as-is with no diagnostics.
///
/// A poisoned lock means some thread panicked while holding it. For **read**
/// guards the protected data is structurally valid (no partial write occurred).
/// For **write** guards the data may have been partially modified before the
/// panic; whether the recovered state is usable is the **caller's
/// responsibility** — continue, reset, or propagate an error as appropriate.
/// Rather than re-raising the panic in every later caller, this function emits a
/// diagnostic naming `table_name` and hands back the inner guard. (#126)
pub(crate) fn recover_lock<T>(result: std::sync::LockResult<T>, table_name: &str) -> T {
    // Happy path first: nothing to report when the lock is healthy.
    let poisoned = match result {
        Ok(guard) => return guard,
        Err(poisoned) => poisoned,
    };

    // #208: binding-neutral structured record — carries `table_name` only, never
    // content. It is emitted in addition to the warnings/stderr path below, so a
    // Python consumer that only watches `warnings` sees no change (open question
    // 4), while a binding's `log` sink still receives the record.
    tl_error!("lock poisoned, recovered: table={table_name:?}");

    // #117: say WHICH lock was recovered. H3: the name is formatted with `{:?}`
    // (not `{}`) so a dynamic name could never smuggle CR/LF into the warning; all
    // current callers pass static literals, and the debug quoting still reads
    // cleanly (`lock for "REGEX_CACHE"`).
    let msg = format!(
        "disarm: lock for {table_name:?} poisoned (a thread panicked while holding the \
         lock). Recovering from poisoned state — data may be inconsistent. This is a bug; \
         please report it."
    );

    // Extension build: prefer Python's `warnings.warn` (a UserWarning) so Python
    // applications can capture the diagnostic via `warnings`/`logging`. There is
    // no `Python<'_>` token here, so attach to the interpreter. `attach` panics
    // when no interpreter is initialized (pyo3 is built without
    // `auto-initialize`): the shipped extension always has one, but a pure-Rust
    // caller may not, and recovery must stay non-fatal — so catch that panic and
    // fall back to stderr. (#117)
    //
    // NOTE(review): the poisoned guard is still held (inside `poisoned`) while we
    // attach. If this can run on a thread that has released the GIL while a
    // GIL-holding thread waits on the same lock, the attach would deadlock —
    // confirm against callers.
    #[cfg(feature = "extension-module")]
    {
        let attach_outcome = std::panic::catch_unwind(std::panic::AssertUnwindSafe(|| {
            pyo3::Python::attach(|py| emit_py_warning(py, &msg));
        }));
        if attach_outcome.is_err() {
            emit_warning_stderr(&msg);
        }
    }
    // Pure crates.io core (no pyo3): there is no Python route, go straight to stderr.
    #[cfg(not(feature = "extension-module"))]
    emit_warning_stderr(&msg);

    poisoned.into_inner()
}

/// Emit a warning line to stderr on a best-effort basis.
///
/// Used by `recover_lock` and other non-PyO3 paths where a `Python<'_>` token
/// is not available. Python-context callers should prefer `emit_py_warning`
/// (which routes through `warnings.warn`) when a GIL token is at hand.
///
/// `msg` is written verbatim followed by a newline. This function never panics:
/// if stderr is closed or the write fails, the diagnostic is dropped.
pub(crate) fn emit_warning_stderr(msg: &str) {
    use std::io::Write as _;

    // Callers already prefix their messages with "disarm: ..."; emit as-is to
    // avoid a double "disarm warning: disarm: ..." prefix (review on #106).
    //
    // `writeln!` with the result ignored, not `eprintln!`: `eprintln!` panics
    // when the write to stderr fails, and this is the last-resort sink on
    // recovery paths (e.g. lock-poison recovery) that must stay non-fatal.
    let mut stderr = std::io::stderr().lock();
    let _ = writeln!(stderr, "{msg}");
}

/// Raise `msg` as a Python warning through `warnings.warn`, or print it to
/// stderr if that route fails. (#106) Requires a `Python<'_>` token.
///
/// `warnings.warn` is called with the message only, so Python applies its
/// default category (`UserWarning`). Any failure — importing `warnings` or the
/// call itself raising — is swallowed and the message goes to stderr instead.
///
/// Prefer this over bare `eprintln!` whenever a `Python<'_>` token is at hand
/// so that Python applications can capture and redirect diagnostics.
/// Callsites without a token should use `emit_warning_stderr`, or attach to the
/// interpreter via `pyo3::Python::attach` — but `attach` panics if no
/// interpreter is initialized, so guard it (as `recover_lock` does on the
/// poison path: catch the panic, fall back to stderr).
#[cfg(feature = "extension-module")]
pub(crate) fn emit_py_warning(py: pyo3::Python<'_>, msg: &str) {
    // Both fallible steps share one error path, so run them in a `?` closure.
    let warn_via_python = || -> pyo3::PyResult<()> {
        py.import("warnings")?.call_method1("warn", (msg,))?;
        Ok(())
    };

    if warn_via_python().is_err() {
        emit_warning_stderr(msg);
    }
}

// NOTE: a previous `recover_lock_or_clear` reset the protected table to its
// default on poison. That silently wiped one caller's registrations when an
// unrelated thread panicked (#64) — a multi-tenant blast-radius hazard. The
// registration tables now use `recover_lock` (recover the data as-is; a panic
// leaves a std collection in a valid-but-unspecified state, never UB).

// Python exception hierarchy (#183), compiled only in the extension build. Each
// `create_exception!` call below gives, in order: the module name, the new
// exception type, its base class, and the Python-visible docstring.
//
//   ValueError
//   └── DisarmError
//       ├── InvalidArgumentError
//       ├── ResourceLimitError
//       └── UnsupportedError
//
// The type objects are attached to the extension module by the `_core`
// initializer earlier in this file. The docstrings are runtime text — edit them
// as user-facing strings, not as comments.
// NOTE(review): the module argument is `disarm` although the compiled module is
// registered as `_core`; presumably this makes the exceptions report as
// `disarm.<Name>` — confirm against the pyo3 `create_exception!` docs.

// Root of the hierarchy; derives from Python's `ValueError`.
#[cfg(feature = "extension-module")]
pyo3::create_exception!(
    disarm,
    DisarmError,
    pyo3::exceptions::PyValueError,
    "Base exception for every error disarm raises.\n\
     Subclass of ``ValueError`` (so existing ``except ValueError`` code keeps\n\
     working); catch ``DisarmError`` to handle any disarm failure. The\n\
     subclasses below categorise the failure (#183)."
);

// Invalid or contradictory arguments.
#[cfg(feature = "extension-module")]
pyo3::create_exception!(
    disarm,
    InvalidArgumentError,
    DisarmError,
    "An argument had an invalid value or a combination of arguments was\n\
     contradictory (e.g. an unknown ``errors``/``form``/``lang`` value, or two\n\
     mutually-exclusive flags). Subclass of ``disarm.DisarmError``."
);

// A configured resource limit was exceeded.
#[cfg(feature = "extension-module")]
pyo3::create_exception!(
    disarm,
    ResourceLimitError,
    DisarmError,
    "A configured resource limit was exceeded (batch size, registration cap,\n\
     regex length, unique-slug attempts). Subclass of ``disarm.DisarmError``."
);

// The requested operation is not supported.
#[cfg(feature = "extension-module")]
pyo3::create_exception!(
    disarm,
    UnsupportedError,
    DisarmError,
    "A requested operation is not supported (e.g. reverse transliteration for a\n\
     language, or auto-detecting an encoding). Subclass of ``disarm.DisarmError``."
);