disarm 0.10.0 - Docs.rs

//! Pure-Rust error type for the disarm core.
//!
//! `Error` is the single internal error enum constructed by the pure-Rust
//! helper functions.  It carries the structured fields needed to render each
//! message via its `Display` impl (`thiserror`). All messages follow one house
//! style (see [`Error`](crate::Error)), enforced by `messages_follow_house_style` (#187).
//!
//! The PyO3 boundary converts `Error` into a Python exception via
//! [`From<Error> for pyo3::PyErr`].  That conversion is the **only** place the
//! core couples to PyO3: it maps each variant into disarm's unified exception
//! hierarchy — the `DisarmError` base or one of its `InvalidArgumentError` /
//! `ResourceLimitError` / `UnsupportedError` subclasses (#183), so
//! `except DisarmError` catches every variant (it missed the five
//! formerly-bare-`PyValueError` sites before #183).
//!
//! Each variant also exposes a stable machine-readable [`ErrorRepr::code`] — new
//! metadata that is not yet surfaced to Python and changes no behaviour.

use std::borrow::Cow;

use thiserror::Error;

/// Maximum length (bytes) of user-controlled input echoed into an error message.
const MAX_ERROR_INPUT_BYTES: usize = 80;

/// Bound user-controlled text embedded in an error message (#200): a long or
/// sensitive input must not bloat the message or be logged in full. Truncates on
/// a UTF-8 char boundary and appends an ellipsis; short inputs pass through
/// borrowed and unchanged. Also the single truncation point reused by the
/// `log-content` TRACE escape hatch (`tl_trace_content!`, #208).
pub(crate) fn truncate_error_text(text: &str) -> Cow<'_, str> {
    if text.len() <= MAX_ERROR_INPUT_BYTES {
        return Cow::Borrowed(text);
    }
    let mut end = MAX_ERROR_INPUT_BYTES;
    while !text.is_char_boundary(end) {
        end -= 1;
    }
    Cow::Owned(format!("{}…", &text[..end]))
}

/// Internal error type for the disarm core.
///
/// One variant per distinct error currently constructed across the core.
///
/// ## Message style (#187)
///
/// Every `#[error(...)]` message follows one policy, enforced by
/// `messages_follow_house_style` below:
/// - **No prefix.** The exception *type* already identifies disarm, so messages
///   carry no `disarm:` prefix.
/// - **Lowercase first word**, unless it is a proper noun / identifier
///   (`UniqueSlugifier`, `register_lang()`, `max_length`, `strict_iso9`).
/// - **Single quotes** around an echoed string value: `got '{got}'`, not
///   `{got:?}`. The exceptions are the three caller-controlled arbitrary-content
///   fields — the regex `pattern`, the `register_lang` bad `keys`, and the
///   unique-slug `separator` — which use `{:?}` (double quotes) so embedded
///   quotes or control characters stay unambiguous. (The `Untranslatable` char
///   also uses `{:?}`, but `char` Debug already renders single quotes while
///   safely escaping control characters.)
/// - **Names the offending value and a remedy** (valid options, a limit, a
///   command, or a flag to pass) wherever one applies.
#[derive(Debug, Error)]
pub(crate) enum ErrorRepr {
    /// Invalid `errors=` mode string (`ErrorMode::parse`).
    #[error("errors must be 'replace', 'ignore', or 'preserve', got '{got}'")]
    InvalidErrorMode {
        /// The offending value.
        got: String,
    },

    /// Invalid normalization form for `normalize`/`is_normalized`.
    #[error("form must be 'NFC', 'NFD', 'NFKC', or 'NFKD', got '{got}'")]
    InvalidNormForm {
        /// The offending value.
        got: String,
    },

    /// Invalid normalization form for the `TextPipeline` `normalize` step.
    #[error("normalize must be 'NFC', 'NFD', 'NFKC', or 'NFKD', got '{got}'")]
    InvalidPipelineNormForm {
        /// The offending value.
        got: String,
    },

    /// Invalid `emoji_style` for `ml_normalize`.
    #[error("emoji_style must be 'cldr' or 'none', got '{got}'")]
    InvalidEmojiStyle {
        /// The offending value.
        got: String,
    },

    /// Invalid `platform` for `sanitize_filename`.
    #[error("platform must be 'universal', 'windows', or 'posix', got '{got}'")]
    InvalidPlatform {
        /// The offending value.
        got: String,
    },

    /// Invalid `target_script` for confusables functions.
    #[error("target_script must be 'latin' or 'cyrillic', got '{got}'")]
    InvalidTargetScript {
        /// The offending value.
        got: String,
    },

    /// Invalid transliteration scheme string (`Scheme::from_str`, #352).
    #[error("scheme must be 'default', 'strict_iso9', or 'gost7034', got '{got}'")]
    InvalidScheme {
        /// The offending value.
        got: String,
    },

    /// Invalid `UrlComponent` token (`UrlComponent::from_str`).
    #[error("url component must be 'path', 'segment', 'query', or 'form', got '{got}'")]
    InvalidUrlComponent {
        /// The offending value.
        got: String,
    },

    /// Invalid `ReverseLang` code (`ReverseLang::from_str`).
    #[error("reverse language must be 'el', 'ru', or 'uk', got '{got}'")]
    InvalidReverseLang {
        /// The offending value.
        got: String,
    },

    /// Unknown `lang` code (eager validation, #68).
    #[error(
        "unknown language code '{got}'{suggestion}; expected 'auto', a BCP-47 \
         alias (nb, nn, da), or one of: {valid}"
    )]
    UnknownLang {
        /// The offending code (rendered with `{:?}` to match the original).
        got: String,
        /// Pre-rendered " (did you mean 'xx'?)" hint, or empty (#186).
        suggestion: String,
        /// Comma-joined list of valid codes.
        valid: String,
    },

    /// Mutually-exclusive flags, raised at the transliterate scalar/context/batch
    /// entry points (#130). Maps to `InvalidArgumentError` (#183).
    #[error("strict_iso9 and gost7034 are mutually exclusive")]
    MutuallyExclusiveBare,

    /// Mutually-exclusive flags, raised at the `TextPipeline` constructor. Same
    /// message and `InvalidArgumentError` mapping as `MutuallyExclusiveBare`;
    /// the two are kept distinct only by construction site (a future cleanup
    /// could merge them now that the Py-type split they encoded is gone).
    #[error("strict_iso9 and gost7034 are mutually exclusive")]
    MutuallyExclusivePipeline,

    /// Batch size over [`crate::MAX_BATCH_SIZE`].
    #[error("batch too large ({len} items); maximum is {max} items")]
    BatchTooLarge {
        /// The submitted batch length.
        len: usize,
        /// The maximum allowed.
        max: usize,
    },

    /// Registered replacements expanded the input past the output cap.
    #[error(
        "registered replacements expanded the input to {size} bytes, exceeding the {max} byte limit"
    )]
    ReplacementOutputTooLarge {
        /// The produced size in bytes.
        size: usize,
        /// The maximum allowed.
        max: usize,
    },

    /// A registration mutation attempted after `seal_registrations()`.
    #[error(
        "{op}: registration tables are sealed (seal_registrations() was called); \
         register/remove/clear are not permitted after sealing"
    )]
    Sealed {
        /// The attempted operation name.
        op: String,
    },

    /// `register_lang` would exceed the registered-language cap. Maps to
    /// `ResourceLimitError` (#183).
    #[error(
        "register_lang(): maximum of {max} registered languages reached; \
         re-registering an existing code is still allowed"
    )]
    RegisterLangLimit {
        /// The maximum number of registered languages.
        max: usize,
    },

    /// `register_lang` mapping keys are not single Unicode characters. Maps to
    /// `InvalidArgumentError` (#183).
    #[error(
        "register_lang(): mapping keys must be exactly one Unicode character; \
         invalid keys: {keys}"
    )]
    RegisterLangBadKeys {
        /// Pre-rendered, comma-joined list of the offending keys (each `{:?}`).
        keys: String,
    },

    /// `register_replacements` would exceed the replacements cap. Maps to
    /// `ResourceLimitError` (#183).
    #[error(
        "register_replacements(): table would exceed the maximum of {max} entries \
         (projected size: {projected}); call clear_replacements() first"
    )]
    RegisterReplacementsLimit {
        /// The maximum number of replacement entries.
        max: usize,
        /// The projected table size.
        projected: usize,
    },

    /// No context dictionary found for the language.
    #[error(
        "context dictionary for {lang} not found; context-aware transliteration needs \
         the prebuilt dictionaries: run `bash scripts/bootstrap_dicts.sh` (from a \
         source checkout) and set the DISARM_DICT_DIR environment variable to the \
         output directory (see docs/user-guide/abjad-transliteration.md)"
    )]
    ContextDictNotFound {
        /// Human-readable language name (e.g. "Arabic").
        lang: String,
    },

    /// Context dictionary found but corrupt (#107).
    #[error(
        "context dictionary for {lang} is corrupt and could not be loaded: {reason}; \
         rebuild it with `bash scripts/bootstrap_dicts.sh` (from a source checkout)"
    )]
    ContextDictCorrupt {
        /// Human-readable language name (e.g. "Arabic").
        lang: String,
        /// The underlying corruption detail (flattened sub-error text, #188).
        reason: String,
    },

    /// Caller-supplied regex pattern exceeds the byte cap (slugify).
    #[error("regex_pattern is too long ({len} bytes); maximum is {max} bytes")]
    RegexTooLong {
        /// The pattern length in bytes.
        len: usize,
        /// The maximum allowed.
        max: usize,
    },

    /// Caller-supplied regex failed to compile. Echoes the offending pattern
    /// (#186); the `regex::Error` Display carries the position/caret detail.
    /// Cause-chain enrichment is #188.
    #[error("invalid regex_pattern {pattern:?}: {source}")]
    RegexCompile {
        /// The offending pattern, echoed back to the caller.
        pattern: String,
        /// The underlying regex compile error.
        source: regex::Error,
    },

    /// `UniqueSlugifier` exhausted its attempt budget.
    #[error(
        "UniqueSlugifier exceeded {max} attempts for '{}'",
        truncate_error_text(text)
    )]
    UniqueSlugAttemptsExceeded {
        /// The attempt cap.
        max: u64,
        /// The input text that could not be made unique.
        text: String,
    },

    /// `UniqueSlugifier` `max_length` too small to ever produce a unique slug.
    #[error(
        "max_length={max_length} is too small to generate a unique slug with separator {separator:?}: \
         need at least {min_unique_len} bytes for the separator plus one counter digit"
    )]
    UniqueSlugMaxLengthTooSmall {
        /// The configured `max_length`.
        max_length: usize,
        /// The configured separator (rendered with `{:?}` — caller-controlled).
        separator: String,
        /// The minimum length required.
        min_unique_len: usize,
    },

    /// Explicit encoding label not recognised (`decode_to_utf8`).
    #[error("unknown encoding: '{got}'{suggestion}")]
    UnknownEncoding {
        /// The unrecognised label.
        got: String,
        /// Pre-rendered " (did you mean 'UTF-8'?)" hint, or empty (#186).
        suggestion: String,
    },

    /// Auto-detected encoding label is not supported (`decode_to_utf8`).
    #[error("auto-detected encoding '{got}' is not supported")]
    UnsupportedAutoEncoding {
        /// The detected-but-unsupported label.
        got: String,
    },

    /// Auto-detection confidence below the caller's threshold.
    #[error(
        "encoding detection confidence {confidence:.2} is below the required \
         minimum {min_confidence:.2} (best guess: '{guess}'); \
         provide an explicit encoding instead"
    )]
    EncodingConfidenceTooLow {
        /// The detection confidence.
        confidence: f64,
        /// The caller's required minimum.
        min_confidence: f64,
        /// The best-guess encoding label.
        guess: String,
    },

    /// `decode_to_utf8` was given a `min_confidence` outside the valid [0.0, 1.0]
    /// range. Validated in the core (the single source of truth) rather than only
    /// in the `_api.py` wrapper, so the raw `_decode_to_utf8` PyO3 entrypoint —
    /// which bypasses that wrapper — is held to the same contract. Maps to
    /// `InvalidArgumentError`.
    #[error("min_confidence must be between 0.0 and 1.0, got {min_confidence}")]
    MinConfidenceOutOfRange {
        /// The caller-supplied confidence threshold.
        min_confidence: f64,
    },

    /// Reverse transliteration not supported for the requested language. Maps to
    /// `UnsupportedError` (#183).
    #[error("reverse transliteration not supported for lang '{lang}'; available: {available}")]
    ReverseUnsupportedLang {
        /// The requested language.
        lang: String,
        /// Comma-joined list of supported languages.
        available: String,
    },

    /// `errors="strict"` hit a character with no transliteration (#184).
    #[error("no transliteration for {ch:?} (U+{:04X}) at byte offset {byte_offset}", *ch as u32)]
    Untranslatable {
        /// The offending character.
        ch: char,
        /// Its byte offset in the input.
        byte_offset: usize,
    },

    /// `decode_to_utf8(strict=True)` hit malformed input that would otherwise be
    /// silently replaced with U+FFFD (#189).
    #[error(
        "decoding as '{encoding}' replaced malformed or invalid byte sequences \
         (lossy); pass strict=False to accept the lossy result and inspect had_errors"
    )]
    LossyDecode {
        /// The encoding that was decoded from.
        encoding: String,
    },

    /// `transliterate()`: `lang` (forward) and `target` (reverse) both set (#231).
    #[error("'lang' and 'target' are mutually exclusive")]
    LangTargetExclusive,

    /// `transliterate()`: `context` (forward-only) and `target` (reverse) both set (#231).
    #[error("'context' and 'target' are mutually exclusive")]
    ContextTargetExclusive,

    /// `transliterate()`: `tones` combined with `context` (#231).
    #[error(
        "'tones' cannot be used with 'context' — context-aware \
         transliteration does not produce toned pinyin"
    )]
    TonesWithContext,

    /// `transliterate()`: `errors="strict"` combined with `context` (#231).
    #[error(
        "errors='strict' cannot be used with 'context' — strict mode is \
         only available for context-free transliteration"
    )]
    StrictWithContext,

    /// `transliterate()`: forward-only parameters supplied alongside `target` (#231).
    #[error("forward-only parameters ({names}) cannot be used with 'target'")]
    ForwardOnlyWithTarget {
        /// Comma-joined, sorted list of the offending parameter names.
        names: String,
    },

    /// Negative `max_length` (slugify / sanitize_filename). The PyO3 entrypoints
    /// accept a signed integer and validate here so the raw functions are held to
    /// the same contract as the `_api.py` wrapper (#231).
    #[error("max_length must be non-negative, got {got}")]
    NegativeMaxLength {
        /// The offending value.
        got: i64,
    },

    /// Negative `max_graphemes` (grapheme_truncate) (#231).
    #[error("max_graphemes must be non-negative, got {got}")]
    NegativeMaxGraphemes {
        /// The offending value.
        got: i64,
    },

    /// `replacement` for `strip_log_injection` contains a character the call
    /// itself neutralizes — it would break the "no raw CR/LF/ESC in output" and
    /// idempotency guarantees.
    #[error(
        "replacement must not contain a character this call neutralizes \
         (found U+{codepoint:04X})"
    )]
    InvalidLogReplacement {
        /// The offending character's code point.
        codepoint: u32,
    },
}

impl ErrorRepr {
    /// Stable machine-readable error code for this variant.
    ///
    /// New metadata (#181): not currently surfaced to Python, so it changes no
    /// behaviour. Codes are stable identifiers, independent of the (mutable)
    /// human-readable message text. Wired into the Python surface in a follow-up
    /// (the boundary conversion will attach it); unused in non-test builds until
    /// then, hence `allow(dead_code)`.
    #[allow(dead_code)]
    pub(crate) fn code(&self) -> &'static str {
        match self {
            ErrorRepr::InvalidErrorMode { .. } => "invalid_error_mode",
            ErrorRepr::InvalidNormForm { .. } => "invalid_norm_form",
            ErrorRepr::InvalidPipelineNormForm { .. } => "invalid_pipeline_norm_form",
            ErrorRepr::InvalidEmojiStyle { .. } => "invalid_emoji_style",
            ErrorRepr::InvalidPlatform { .. } => "invalid_platform",
            ErrorRepr::InvalidTargetScript { .. } => "invalid_target_script",
            ErrorRepr::InvalidScheme { .. } => "invalid_scheme",
            ErrorRepr::InvalidUrlComponent { .. } => "invalid_url_component",
            ErrorRepr::InvalidReverseLang { .. } => "invalid_reverse_lang",
            ErrorRepr::InvalidLogReplacement { .. } => "invalid_log_replacement",
            ErrorRepr::UnknownLang { .. } => "unknown_lang",
            ErrorRepr::MutuallyExclusiveBare | ErrorRepr::MutuallyExclusivePipeline => {
                "mutually_exclusive"
            }
            ErrorRepr::BatchTooLarge { .. } => "batch_too_large",
            ErrorRepr::ReplacementOutputTooLarge { .. } => "replacement_output_too_large",
            ErrorRepr::Sealed { .. } => "sealed",
            ErrorRepr::RegisterLangLimit { .. } => "register_lang_limit",
            ErrorRepr::RegisterLangBadKeys { .. } => "register_lang_bad_keys",
            ErrorRepr::RegisterReplacementsLimit { .. } => "register_replacements_limit",
            ErrorRepr::ContextDictNotFound { .. } => "context_dict_not_found",
            ErrorRepr::ContextDictCorrupt { .. } => "context_dict_corrupt",
            ErrorRepr::RegexTooLong { .. } => "regex_too_long",
            ErrorRepr::RegexCompile { .. } => "regex_compile",
            ErrorRepr::UniqueSlugAttemptsExceeded { .. } => "unique_slug_attempts_exceeded",
            ErrorRepr::UniqueSlugMaxLengthTooSmall { .. } => "unique_slug_max_length_too_small",
            ErrorRepr::UnknownEncoding { .. } => "unknown_encoding",
            ErrorRepr::UnsupportedAutoEncoding { .. } => "unsupported_auto_encoding",
            ErrorRepr::EncodingConfidenceTooLow { .. } => "encoding_confidence_too_low",
            ErrorRepr::MinConfidenceOutOfRange { .. } => "min_confidence_out_of_range",
            ErrorRepr::ReverseUnsupportedLang { .. } => "reverse_unsupported_lang",
            ErrorRepr::Untranslatable { .. } => "untranslatable",
            ErrorRepr::LossyDecode { .. } => "lossy_decode",
            ErrorRepr::LangTargetExclusive => "lang_target_exclusive",
            ErrorRepr::ContextTargetExclusive => "context_target_exclusive",
            ErrorRepr::TonesWithContext => "tones_with_context",
            ErrorRepr::StrictWithContext => "strict_with_context",
            ErrorRepr::ForwardOnlyWithTarget { .. } => "forward_only_with_target",
            ErrorRepr::NegativeMaxLength { .. } => "negative_max_length",
            ErrorRepr::NegativeMaxGraphemes { .. } => "negative_max_graphemes",
        }
    }
}

/// Convert a caller-supplied signed `max_length` to `usize`, rejecting negatives
/// in the core (#231).
///
/// The PyO3 entrypoints accept a *signed* integer so a negative value raises
/// disarm's `InvalidArgumentError` (the documented contract) instead of
/// PyO3's `OverflowError` from a silent unsigned conversion. On 64-bit targets
/// the only `try_from` failure is a negative value.
pub(crate) fn checked_max_length(value: i64) -> Result<usize, ErrorRepr> {
    usize::try_from(value).map_err(|_| ErrorRepr::NegativeMaxLength { got: value })
}

/// Convert a caller-supplied signed `max_graphemes` to `usize`, rejecting
/// negatives in the core (#231). See [`checked_max_length`].
pub(crate) fn checked_max_graphemes(value: i64) -> Result<usize, ErrorRepr> {
    usize::try_from(value).map_err(|_| ErrorRepr::NegativeMaxGraphemes { got: value })
}

#[cfg(feature = "extension-module")]
impl From<ErrorRepr> for pyo3::PyErr {
    /// Convert a core `ErrorRepr` into a Python exception from disarm's unified
    /// hierarchy (#183): a [`crate::DisarmError`] base with `InvalidArgumentError`
    /// / `ResourceLimitError` / `UnsupportedError` subclasses. Every variant maps
    /// to exactly one of them, so `except DisarmError` catches all of them —
    /// including the five sites that were previously bare `PyValueError`
    /// (mutually-exclusive flags, register limits, reverse unsupported lang) and
    /// silently escaped it. `DisarmError` remains a `ValueError` subclass, so
    /// existing `except ValueError` code is unaffected. The message is `Display`,
    /// identical to before at every site.
    ///
    /// The match is exhaustive (no wildcard) on purpose: a new variant must be
    /// assigned a category here, not silently default.
    fn from(err: ErrorRepr) -> Self {
        let msg = err.to_string();

        // #188: surface the underlying error as a chained `__cause__` for the
        // variants that wrap one, instead of only flattening it into the message.
        // Peeked by reference before `err` is consumed by the category match;
        // attached under the GIL afterwards. Python can't hold the Rust error
        // type, so the cause carries its rendered text in a `ValueError`.
        let cause: Option<pyo3::PyErr> = match &err {
            ErrorRepr::RegexCompile { source, .. } => {
                Some(pyo3::exceptions::PyValueError::new_err(source.to_string()))
            }
            ErrorRepr::ContextDictCorrupt { reason, .. } => {
                Some(pyo3::exceptions::PyValueError::new_err(reason.clone()))
            }
            _ => None,
        };

        let err_py = match err {
            // InvalidArgumentError — a bad argument value or a contradictory
            // combination of arguments.
            ErrorRepr::InvalidErrorMode { .. }
            | ErrorRepr::InvalidNormForm { .. }
            | ErrorRepr::InvalidPipelineNormForm { .. }
            | ErrorRepr::InvalidEmojiStyle { .. }
            | ErrorRepr::InvalidPlatform { .. }
            | ErrorRepr::InvalidTargetScript { .. }
            | ErrorRepr::InvalidScheme { .. }
            | ErrorRepr::InvalidUrlComponent { .. }
            | ErrorRepr::InvalidReverseLang { .. }
            | ErrorRepr::InvalidLogReplacement { .. }
            | ErrorRepr::UnknownLang { .. }
            | ErrorRepr::MutuallyExclusiveBare
            | ErrorRepr::MutuallyExclusivePipeline
            | ErrorRepr::RegisterLangBadKeys { .. }
            | ErrorRepr::RegexCompile { .. }
            | ErrorRepr::UniqueSlugMaxLengthTooSmall { .. }
            | ErrorRepr::UnknownEncoding { .. }
            | ErrorRepr::MinConfidenceOutOfRange { .. }
            | ErrorRepr::LangTargetExclusive
            | ErrorRepr::ContextTargetExclusive
            | ErrorRepr::TonesWithContext
            | ErrorRepr::StrictWithContext
            | ErrorRepr::ForwardOnlyWithTarget { .. }
            | ErrorRepr::NegativeMaxLength { .. }
            | ErrorRepr::NegativeMaxGraphemes { .. } => crate::InvalidArgumentError::new_err(msg),

            // ResourceLimitError — a configured limit was exceeded.
            ErrorRepr::BatchTooLarge { .. }
            | ErrorRepr::ReplacementOutputTooLarge { .. }
            | ErrorRepr::RegisterLangLimit { .. }
            | ErrorRepr::RegisterReplacementsLimit { .. }
            | ErrorRepr::RegexTooLong { .. }
            | ErrorRepr::UniqueSlugAttemptsExceeded { .. } => {
                crate::ResourceLimitError::new_err(msg)
            }

            // UnsupportedError — a requested feature is unavailable.
            ErrorRepr::UnsupportedAutoEncoding { .. }
            | ErrorRepr::ReverseUnsupportedLang { .. } => crate::UnsupportedError::new_err(msg),

            // Base DisarmError — state / data errors that fit no category above.
            ErrorRepr::Sealed { .. }
            | ErrorRepr::ContextDictNotFound { .. }
            | ErrorRepr::ContextDictCorrupt { .. }
            | ErrorRepr::EncodingConfidenceTooLow { .. }
            | ErrorRepr::Untranslatable { .. }
            | ErrorRepr::LossyDecode { .. } => crate::DisarmError::new_err(msg),
        };

        if let Some(cause_err) = cause {
            pyo3::Python::attach(|py| err_py.set_cause(py, Some(cause_err)));
        }
        err_py
    }
}

// ── Public opaque error (#38) ────────────────────────────────────────────────

/// The error type returned by disarm's fallible Rust API ([`crate::api`]).
///
/// Opaque by design: the internal `ErrorRepr` variants are an implementation
/// detail and may change without a breaking release. Callers branch on the
/// coarse, stable [`Error::kind`] (or read the stable [`Error::code`] / the
/// `Display` message), never on a concrete variant. `source()` deliberately
/// returns `None` so the wrapped `regex::Error` (and any future inner error)
/// does not leak into the public contract.
#[derive(Debug)]
pub struct Error(ErrorRepr);

/// Coarse, stable classification of an [`Error`](crate::Error) (#38).
///
/// This is the part of the error contract callers may rely on. The set is
/// `#[non_exhaustive]`: a future release may add a kind, so always include a
/// wildcard arm when matching.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
#[non_exhaustive]
pub enum ErrorKind {
    /// A bad argument value or a contradictory combination of arguments.
    InvalidArgument,
    /// A configured limit was exceeded.
    ResourceLimit,
    /// A requested feature or resource is unavailable.
    Unsupported,
    /// A residual state/data error fitting no category above. Unstable: errors
    /// here may be reclassified into a more specific kind in a future release.
    Other,
}

impl Error {
    /// The coarse, stable [`ErrorKind`] of this error.
    ///
    /// The partition mirrors the Python exception hierarchy (`InvalidArgumentError`
    /// / `ResourceLimitError` / `UnsupportedError` / `DisarmError`).
    pub fn kind(&self) -> ErrorKind {
        match &self.0 {
            ErrorRepr::InvalidErrorMode { .. }
            | ErrorRepr::InvalidNormForm { .. }
            | ErrorRepr::InvalidPipelineNormForm { .. }
            | ErrorRepr::InvalidEmojiStyle { .. }
            | ErrorRepr::InvalidPlatform { .. }
            | ErrorRepr::InvalidTargetScript { .. }
            | ErrorRepr::InvalidScheme { .. }
            | ErrorRepr::InvalidUrlComponent { .. }
            | ErrorRepr::InvalidReverseLang { .. }
            | ErrorRepr::InvalidLogReplacement { .. }
            | ErrorRepr::UnknownLang { .. }
            | ErrorRepr::MutuallyExclusiveBare
            | ErrorRepr::MutuallyExclusivePipeline
            | ErrorRepr::RegisterLangBadKeys { .. }
            | ErrorRepr::RegexCompile { .. }
            | ErrorRepr::UniqueSlugMaxLengthTooSmall { .. }
            | ErrorRepr::UnknownEncoding { .. }
            | ErrorRepr::MinConfidenceOutOfRange { .. }
            | ErrorRepr::LangTargetExclusive
            | ErrorRepr::ContextTargetExclusive
            | ErrorRepr::TonesWithContext
            | ErrorRepr::StrictWithContext
            | ErrorRepr::ForwardOnlyWithTarget { .. }
            | ErrorRepr::NegativeMaxLength { .. }
            | ErrorRepr::NegativeMaxGraphemes { .. } => ErrorKind::InvalidArgument,

            ErrorRepr::BatchTooLarge { .. }
            | ErrorRepr::ReplacementOutputTooLarge { .. }
            | ErrorRepr::RegisterLangLimit { .. }
            | ErrorRepr::RegisterReplacementsLimit { .. }
            | ErrorRepr::RegexTooLong { .. }
            | ErrorRepr::UniqueSlugAttemptsExceeded { .. } => ErrorKind::ResourceLimit,

            ErrorRepr::UnsupportedAutoEncoding { .. }
            | ErrorRepr::ReverseUnsupportedLang { .. } => ErrorKind::Unsupported,

            ErrorRepr::Sealed { .. }
            | ErrorRepr::ContextDictNotFound { .. }
            | ErrorRepr::ContextDictCorrupt { .. }
            | ErrorRepr::EncodingConfidenceTooLow { .. }
            | ErrorRepr::Untranslatable { .. }
            | ErrorRepr::LossyDecode { .. } => ErrorKind::Other,
        }
    }

    /// A stable machine-readable code for this error, independent of the
    /// (mutable) human-readable `Display` message.
    pub fn code(&self) -> &'static str {
        self.0.code()
    }
}

impl std::fmt::Display for Error {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        std::fmt::Display::fmt(&self.0, f)
    }
}

impl std::error::Error for Error {
    /// Always `None`: the opaque error never exposes its inner source (e.g. the
    /// wrapped `regex::Error`), keeping the public contract decoupled from
    /// internal dependencies.
    fn source(&self) -> Option<&(dyn std::error::Error + 'static)> {
        None
    }
}

impl From<ErrorRepr> for Error {
    fn from(repr: ErrorRepr) -> Self {
        Error(repr)
    }
}

#[cfg(feature = "extension-module")]
impl From<Error> for pyo3::PyErr {
    fn from(err: Error) -> Self {
        err.0.into()
    }
}

#[cfg(test)]
mod tests {
    use super::{truncate_error_text, Error, ErrorKind, ErrorRepr, MAX_ERROR_INPUT_BYTES};

    #[test]
    fn truncate_error_text_bounds_long_input() {
        let long = "a".repeat(500);
        let out = truncate_error_text(&long);
        // Bounded to the cap plus the multi-byte ellipsis, never the full input.
        assert!(out.len() < long.len());
        assert!(out.starts_with("aaaa"));
        assert!(out.ends_with('…'));
        assert!(out.len() <= MAX_ERROR_INPUT_BYTES + '…'.len_utf8());
    }

    #[test]
    fn truncate_error_text_passes_short_input_through() {
        assert_eq!(truncate_error_text("short"), "short");
    }

    #[test]
    fn opaque_error_kind_partition_and_delegation() {
        // One representative per kind: the opaque `Error::kind()` partition must
        // agree with the `From<ErrorRepr> for PyErr` category mapping, and the
        // opaque wrapper must delegate `Display`/`code` to the inner repr.
        let cases = [
            (
                ErrorRepr::InvalidTargetScript { got: "x".into() },
                ErrorKind::InvalidArgument,
                "invalid_target_script",
            ),
            (
                ErrorRepr::BatchTooLarge { len: 2, max: 1 },
                ErrorKind::ResourceLimit,
                "batch_too_large",
            ),
            (
                ErrorRepr::ReverseUnsupportedLang {
                    lang: "zz".into(),
                    available: "ru".into(),
                },
                ErrorKind::Unsupported,
                "reverse_unsupported_lang",
            ),
            (
                ErrorRepr::Untranslatable {
                    ch: '\u{1F600}',
                    byte_offset: 0,
                },
                ErrorKind::Other,
                "untranslatable",
            ),
        ];
        for (repr, kind, code) in cases {
            let display = repr.to_string();
            let err = Error::from(repr);
            assert_eq!(err.kind(), kind, "kind for {code}");
            assert_eq!(err.code(), code);
            assert_eq!(err.to_string(), display, "Display delegates to repr");
            // source() never leaks an inner error (e.g. regex::Error).
            assert!(std::error::Error::source(&err).is_none());
        }
    }

    #[test]
    fn truncate_error_text_cuts_on_char_boundary() {
        // Multi-byte chars straddling the cap must not panic or split a char.
        let s = "é".repeat(100); // 200 bytes
        let out = truncate_error_text(&s);
        assert!(out.ends_with('…'));
        // The Owned branch produced valid UTF-8 (would have panicked otherwise).
        assert!(out.chars().count() > 1);
    }

    #[test]
    fn unique_slug_error_display_is_truncated() {
        let err = ErrorRepr::UniqueSlugAttemptsExceeded {
            max: 5,
            text: "x".repeat(500),
        };
        let msg = err.to_string();
        assert!(msg.starts_with("UniqueSlugifier exceeded 5 attempts for 'xxxx"));
        assert!(msg.contains('…'));
        assert!(
            msg.len() < 200,
            "error message should be bounded, got {} bytes",
            msg.len()
        );
    }

    /// `code()` is stable and exhaustive: every variant returns a non-empty,
    /// SCREAMING-or-snake identifier. This also keeps `code()` exercised (it is
    /// not yet wired into the Python surface).
    #[test]
    fn codes_are_nonempty_snake_case() {
        // Intentionally invalid pattern: we only need a real `regex::Error`
        // value to populate the `RegexCompile` sample below. Hoisted to a `let`
        // so the `#[allow]` can scope the lint (newer stable clippy's
        // `invalid_regex` flags the literal `"["`; expression-level attributes
        // are not stable inside the array literal).
        #[allow(clippy::invalid_regex)]
        let invalid_regex_err = regex::Regex::new("[").unwrap_err();
        let samples = [
            ErrorRepr::InvalidErrorMode { got: "x".into() },
            ErrorRepr::InvalidNormForm { got: "x".into() },
            ErrorRepr::InvalidPipelineNormForm { got: "x".into() },
            ErrorRepr::InvalidEmojiStyle { got: "x".into() },
            ErrorRepr::InvalidPlatform { got: "x".into() },
            ErrorRepr::InvalidTargetScript { got: "x".into() },
            ErrorRepr::InvalidLogReplacement { codepoint: 0x0A },
            ErrorRepr::UnknownLang {
                got: "x".into(),
                suggestion: String::new(),
                valid: "a, b".into(),
            },
            ErrorRepr::MutuallyExclusiveBare,
            ErrorRepr::MutuallyExclusivePipeline,
            ErrorRepr::BatchTooLarge { len: 2, max: 1 },
            ErrorRepr::ReplacementOutputTooLarge { size: 2, max: 1 },
            ErrorRepr::Sealed {
                op: "register".into(),
            },
            ErrorRepr::RegisterLangLimit { max: 1 },
            ErrorRepr::RegisterLangBadKeys {
                keys: "\"x\"".into(),
            },
            ErrorRepr::RegisterReplacementsLimit {
                max: 1,
                projected: 2,
            },
            ErrorRepr::ContextDictNotFound {
                lang: "Arabic".into(),
            },
            ErrorRepr::ContextDictCorrupt {
                lang: "Arabic".into(),
                reason: "bad".into(),
            },
            ErrorRepr::RegexTooLong { len: 2, max: 1 },
            ErrorRepr::RegexCompile {
                pattern: "[".into(),
                source: invalid_regex_err,
            },
            ErrorRepr::UniqueSlugAttemptsExceeded {
                max: 1,
                text: "x".into(),
            },
            ErrorRepr::UniqueSlugMaxLengthTooSmall {
                max_length: 1,
                separator: "-".into(),
                min_unique_len: 2,
            },
            ErrorRepr::UnknownEncoding {
                got: "x".into(),
                suggestion: String::new(),
            },
            ErrorRepr::UnsupportedAutoEncoding { got: "x".into() },
            ErrorRepr::EncodingConfidenceTooLow {
                confidence: 0.5,
                min_confidence: 0.9,
                guess: "UTF-8".into(),
            },
            ErrorRepr::ReverseUnsupportedLang {
                lang: "de".into(),
                available: "ru, uk".into(),
            },
            ErrorRepr::Untranslatable {
                ch: '😀',
                byte_offset: 3,
            },
            ErrorRepr::LossyDecode {
                encoding: "Shift_JIS".into(),
            },
        ];
        for e in &samples {
            let code = e.code();
            assert!(!code.is_empty());
            assert!(
                code.bytes().all(|b| b.is_ascii_lowercase() || b == b'_'),
                "code {code:?} is not lower snake_case"
            );
        }
    }

    /// Every message follows the house style (#187): non-empty, lowercase-first
    /// (bar identifiers), single-quoted values, and echoes the offending value.
    #[test]
    fn messages_follow_house_style() {
        // Identifiers permitted to begin a message with an uppercase letter.
        const ALLOWED_UPPERCASE: &[&str] = &["UniqueSlugifier"];
        // Variants that render arbitrary user content with `{:?}` (double quotes).
        fn allows_double_quote(e: &ErrorRepr) -> bool {
            matches!(
                e,
                ErrorRepr::RegexCompile { .. }
                    | ErrorRepr::RegisterLangBadKeys { .. }
                    | ErrorRepr::UniqueSlugMaxLengthTooSmall { .. }
            )
        }

        // Lowercase so it doesn't trip the lowercase-first check when a message
        // (e.g. Sealed) begins with the echoed value itself.
        const MARKER: &str = "zzvaluezz";
        #[allow(clippy::invalid_regex)]
        let regex_err = regex::Regex::new("[").unwrap_err();
        // (sample, does the message echo MARKER?)
        let samples: Vec<(ErrorRepr, bool)> = vec![
            (ErrorRepr::InvalidErrorMode { got: MARKER.into() }, true),
            (ErrorRepr::InvalidNormForm { got: MARKER.into() }, true),
            (
                ErrorRepr::InvalidPipelineNormForm { got: MARKER.into() },
                true,
            ),
            (ErrorRepr::InvalidEmojiStyle { got: MARKER.into() }, true),
            (ErrorRepr::InvalidPlatform { got: MARKER.into() }, true),
            (ErrorRepr::InvalidTargetScript { got: MARKER.into() }, true),
            (
                ErrorRepr::UnknownLang {
                    got: MARKER.into(),
                    suggestion: String::new(),
                    valid: "a, b".into(),
                },
                true,
            ),
            (ErrorRepr::MutuallyExclusiveBare, false),
            (ErrorRepr::MutuallyExclusivePipeline, false),
            (ErrorRepr::BatchTooLarge { len: 2, max: 1 }, false),
            (
                ErrorRepr::ReplacementOutputTooLarge { size: 2, max: 1 },
                false,
            ),
            (ErrorRepr::Sealed { op: MARKER.into() }, true),
            (ErrorRepr::RegisterLangLimit { max: 1 }, false),
            // keys are rendered with {:?} (arbitrary content) — a double-quote exception.
            (
                ErrorRepr::RegisterLangBadKeys {
                    keys: format!("{MARKER:?}"),
                },
                true,
            ),
            (
                ErrorRepr::RegisterReplacementsLimit {
                    max: 1,
                    projected: 2,
                },
                false,
            ),
            (
                ErrorRepr::ContextDictNotFound {
                    lang: MARKER.into(),
                },
                true,
            ),
            (
                ErrorRepr::ContextDictCorrupt {
                    lang: MARKER.into(),
                    reason: "bad".into(),
                },
                true,
            ),
            (ErrorRepr::RegexTooLong { len: 2, max: 1 }, false),
            (
                ErrorRepr::RegexCompile {
                    pattern: MARKER.into(),
                    source: regex_err,
                },
                true,
            ),
            (
                ErrorRepr::UniqueSlugAttemptsExceeded {
                    max: 1,
                    text: MARKER.into(),
                },
                true,
            ),
            (
                ErrorRepr::UniqueSlugMaxLengthTooSmall {
                    max_length: 1,
                    separator: MARKER.into(),
                    min_unique_len: 2,
                },
                true,
            ),
            (
                ErrorRepr::UnknownEncoding {
                    got: MARKER.into(),
                    suggestion: String::new(),
                },
                true,
            ),
            (
                ErrorRepr::UnsupportedAutoEncoding { got: MARKER.into() },
                true,
            ),
            (
                ErrorRepr::EncodingConfidenceTooLow {
                    confidence: 0.5,
                    min_confidence: 0.9,
                    guess: MARKER.into(),
                },
                true,
            ),
            (
                ErrorRepr::MinConfidenceOutOfRange {
                    min_confidence: 1.5,
                },
                false,
            ),
            (
                ErrorRepr::ReverseUnsupportedLang {
                    lang: MARKER.into(),
                    available: "ru".into(),
                },
                true,
            ),
            (
                ErrorRepr::Untranslatable {
                    ch: '😀',
                    byte_offset: 3,
                },
                false,
            ),
            (
                ErrorRepr::LossyDecode {
                    encoding: MARKER.into(),
                },
                true,
            ),
        ];

        for (err, echoes_marker) in &samples {
            let msg = err.to_string();
            let code = err.code();
            assert!(!msg.is_empty(), "empty message for {code}");

            let first = msg.chars().next().unwrap();
            let upper_ok = ALLOWED_UPPERCASE.iter().any(|p| msg.starts_with(p));
            assert!(
                !first.is_alphabetic() || first.is_lowercase() || upper_ok,
                "{code}: message should start lowercase (or a known identifier): {msg:?}"
            );

            if !allows_double_quote(err) {
                assert!(
                    !msg.contains('"'),
                    "{code}: use single quotes, not double quotes: {msg:?}"
                );
            }

            if *echoes_marker {
                assert!(
                    msg.contains(MARKER),
                    "{code}: message must echo the offending value: {msg:?}"
                );
            }
        }
    }

    /// The "invalid argument value" family must always guide the caller to the
    /// valid set (#187): the message names the expected options or a remedy.
    #[test]
    fn invalid_value_messages_offer_a_remedy() {
        let samples = [
            ErrorRepr::InvalidErrorMode { got: "x".into() },
            ErrorRepr::InvalidNormForm { got: "x".into() },
            ErrorRepr::InvalidPipelineNormForm { got: "x".into() },
            ErrorRepr::InvalidEmojiStyle { got: "x".into() },
            ErrorRepr::InvalidPlatform { got: "x".into() },
            ErrorRepr::InvalidTargetScript { got: "x".into() },
            ErrorRepr::UnknownLang {
                got: "x".into(),
                suggestion: String::new(),
                valid: "a, b".into(),
            },
            ErrorRepr::MinConfidenceOutOfRange {
                min_confidence: 1.5,
            },
        ];
        for e in &samples {
            let msg = e.to_string();
            assert!(
                msg.contains("must be") || msg.contains("expected") || msg.contains("one of"),
                "{}: should name the valid options: {msg:?}",
                e.code()
            );
        }
    }

    /// The two mutually-exclusive variants intentionally share one code and one
    /// message but map to *different* Python exception types (preserved, #181).
    #[test]
    fn mutually_exclusive_variants_share_text_and_code() {
        let a = ErrorRepr::MutuallyExclusiveBare;
        let b = ErrorRepr::MutuallyExclusivePipeline;
        assert_eq!(a.to_string(), b.to_string());
        assert_eq!(a.code(), b.code());
        assert_eq!(
            a.to_string(),
            "strict_iso9 and gost7034 are mutually exclusive"
        );
    }
}