// rtl_isolate_fix 0.1.0

//! rtl_isolate_fix
//!
//! هدف: اصلاح متن RTL در قالب **Plain Text** با تزریق Unicode isolates:
//! - LRI (U+2066)  Left-to-Right Isolate
//! - PDI (U+2069)  Pop Directional Isolate
//!
//! این کار باعث می‌شود توکن‌های فنی LTR مثل `C#` در متن فارسی کمتر دچار وارونگی دیداری شوند.
//! تصمیم‌های پروژه:
//! - URL و Email ایزوله نمی‌شوند.
//! - متن خروجی decorated است (خود متن شامل isolates می‌شود).

use std::borrow::Cow;

/// U+2066 LEFT-TO-RIGHT ISOLATE (LRI).
pub const LRI: char = '\u{2066}';
/// U+2069 POP DIRECTIONAL ISOLATE (PDI).
pub const PDI: char = '\u{2069}';

/// Settings for the decorate algorithm.
#[derive(Clone, Debug)]
pub struct FixConfig {
    /// When `true`, URLs are not isolated.
    pub exclude_urls: bool,
    /// When `true`, e-mail addresses are not isolated.
    pub exclude_emails: bool,
    /// When `true`, tokens made up only of digits are not isolated.
    pub exclude_pure_numbers: bool,
}

impl Default for FixConfig {
    /// Returns a configuration with every exclusion switched on.
    fn default() -> Self {
        let enabled = true;
        FixConfig {
            exclude_urls: enabled,
            exclude_emails: enabled,
            exclude_pure_numbers: enabled,
        }
    }
}

/// خروجی را با تزریق LRI…PDI برای توکن‌های فنی LTR decorate می‌کند.
///
/// این تابع **idempotent** طراحی شده: اگر متن قبلاً داخل isolate باشد، دوباره wrap نمی‌کند.
///
/// توجه:
/// - این تابع برای سناریوی «ابزار تایپ RTL» طراحی شده و در حالت RTL-mode
///   توکن‌های فنی را wrap می‌کند.
pub fn decorate_rtl_plain_text(input: &str, cfg: &FixConfig) -> String {
    if input.is_empty() {
        return String::new();
    }

    let mut out = String::with_capacity(input.len() + 16);

    // Track existing isolates so we don't double-wrap.
    let mut isolate_depth: i32 = 0;

    let bytes = input.as_bytes();
    let mut i: usize = 0;

    while i < bytes.len() {
        // Work at char boundaries.
        let (ch, ch_len) = next_char(input, i);

        // Existing isolates: LRI/RLI/FSI increase depth; PDI decreases.
        if ch == LRI || ch == '\u{2067}' || ch == '\u{2068}' {
            isolate_depth += 1;
            out.push(ch);
            i += ch_len;
            continue;
        }
        if ch == PDI {
            if isolate_depth > 0 {
                isolate_depth -= 1;
            }
            out.push(ch);
            i += ch_len;
            continue;
        }

        // When inside isolate, copy as-is.
        if isolate_depth > 0 {
            out.push(ch);
            i += ch_len;
            continue;
        }

        // Detect a technical token starting with ASCII alnum.
        if is_token_start(ch) {
            let start = i;
            let mut j = i + ch_len;

            while j < bytes.len() {
                let (c2, c2_len) = next_char(input, j);
                if is_token_continue(c2) {
                    j += c2_len;
                } else {
                    break;
                }
            }

            let token = &input[start..j];
            let (core, trailing) = split_trailing_punct(token);

            // Decide whether we should wrap.
            if should_wrap(core, cfg) {
                out.push(LRI);
                out.push_str(core);
                out.push(PDI);
            } else {
                out.push_str(core);
            }
            out.push_str(trailing);

            i = j;
            continue;
        }

        out.push(ch);
        i += ch_len;
    }

    out
}

/// حذف isolates از متن (برای جستجو/ایندکس/مقایسه رشته‌ای).
pub fn strip_isolates(input: &str) -> Cow<'_, str> {
    if input.is_empty() {
        return Cow::Borrowed(input);
    }

    let mut changed = false;
    for ch in input.chars() {
        if is_isolate_control(ch) {
            changed = true;
            break;
        }
    }
    if !changed {
        return Cow::Borrowed(input);
    }

    let mut out = String::with_capacity(input.len());
    for ch in input.chars() {
        if !is_isolate_control(ch) {
            out.push(ch);
        }
    }
    Cow::Owned(out)
}

/// Debugging aid: makes invisible directional characters visible.
///
/// Each of the following characters is replaced by a bracketed label:
/// LRI → `⟦LRI⟧`, RLI → `⟦RLI⟧`, FSI → `⟦FSI⟧`, PDI → `⟦PDI⟧`,
/// LRM → `⟦LRM⟧`, RLM → `⟦RLM⟧`. All other characters are copied unchanged.
pub fn visualize_invisibles(input: &str) -> String {
    if input.is_empty() {
        return String::new();
    }

    let mut rendered = String::with_capacity(input.len() + 32);
    for ch in input.chars() {
        let label = match ch {
            LRI => "⟦LRI⟧",
            '\u{2067}' => "⟦RLI⟧",
            '\u{2068}' => "⟦FSI⟧",
            PDI => "⟦PDI⟧",
            '\u{200E}' => "⟦LRM⟧",
            '\u{200F}' => "⟦RLM⟧",
            _ => {
                // Ordinary character: keep it as it is.
                rendered.push(ch);
                continue;
            }
        };
        rendered.push_str(label);
    }
    rendered
}

// -------------------- internals --------------------

/// Returns `true` for the four isolate controls U+2066..=U+2069
/// (LRI, RLI, FSI, PDI).
#[inline]
fn is_isolate_control(ch: char) -> bool {
    matches!(ch, '\u{2066}'..='\u{2069}')
}

/// Decodes the character of `s` that starts at `byte_idx` and returns it
/// together with its UTF-8 length in bytes.
///
/// # Panics
///
/// Panics if `byte_idx` is not a char boundary of `s`, or if no character
/// starts there (i.e. `byte_idx >= s.len()`).
#[inline]
fn next_char(s: &str, byte_idx: usize) -> (char, usize) {
    s[byte_idx..]
        .chars()
        .next()
        .map(|first| (first, first.len_utf8()))
        .unwrap()
}

/// A technical token may only begin with an ASCII letter or ASCII digit.
#[inline]
fn is_token_start(ch: char) -> bool {
    matches!(ch, '0'..='9' | 'A'..='Z' | 'a'..='z')
}

/// Returns `true` if `ch` may appear after the first character of a token:
/// an ASCII letter or digit, or one of the punctuation marks commonly found
/// in identifiers, versions, paths and similar technical text.
#[inline]
fn is_token_continue(ch: char) -> bool {
    // Punctuation accepted inside a token.
    const TECH_PUNCT: &str = "#@./\\:_-+=%&?~";

    ch.is_ascii_alphanumeric() || TECH_PUNCT.contains(ch)
}

/// Splits `token` into `(core, trailing)`, where `trailing` is the longest
/// suffix made up only of `.`, `,`, `;`, `!` and `?`.
///
/// Such marks may belong to the surrounding sentence rather than the token
/// (e.g. `C#.` or `v1.2,`). Separating them lets URL/e-mail detection see the
/// bare token and keeps the sentence-ending mark outside the isolate.
/// Only these common ASCII marks are trimmed.
fn split_trailing_punct(token: &str) -> (&str, &str) {
    let core_len = token
        .trim_end_matches(|c: char| matches!(c, '.' | ',' | ';' | '!' | '?'))
        .len();

    token.split_at(core_len)
}

/// Decides whether `core` (a token with trailing sentence punctuation already
/// removed) should be wrapped in an isolate.
///
/// Rules, in order:
/// 1. An empty core is never wrapped.
/// 2. A core made up only of ASCII digits is wrapped exactly when
///    `cfg.exclude_pure_numbers` is `false`.
/// 3. A core recognised as a URL or e-mail address is skipped when the
///    corresponding `cfg.exclude_*` flag is set.
/// 4. Otherwise the core is wrapped if it contains an ASCII letter or one of
///    the technical punctuation marks `# . / \ : _ - + =`.
fn should_wrap(core: &str, cfg: &FixConfig) -> bool {
    if core.is_empty() {
        return false;
    }

    // Digit-only tokens contain neither a letter nor technical punctuation, so
    // the heuristic below would reject them regardless of the setting. Let the
    // flag alone decide, otherwise `exclude_pure_numbers = false` has no effect.
    if core.chars().all(|c| c.is_ascii_digit()) {
        return !cfg.exclude_pure_numbers;
    }

    // Configured exclusions.
    if cfg.exclude_urls && looks_like_url(core) {
        return false;
    }
    if cfg.exclude_emails && looks_like_email(core) {
        return false;
    }

    // Technical-token heuristic: any ASCII letter, or any of the listed marks.
    core.chars().any(|ch| {
        ch.is_ascii_alphabetic()
            || matches!(ch, '#' | '.' | '/' | '\\' | ':' | '_' | '-' | '+' | '=')
    })
}

/// Very conservative URL detection: only obvious URLs are recognised.
///
/// Returns `true` when `s`
/// - contains the scheme separator `://` anywhere, or
/// - starts with `www.` (compared ASCII case-insensitively).
fn looks_like_url(s: &str) -> bool {
    // Any scheme separator marks the token as a URL. This already covers
    // `http://`, `https://` and `ftp://` in every letter case, so no separate
    // lowercase-and-compare pass (which would allocate) is needed.
    if s.contains("://") {
        return true;
    }

    // Scheme-less form. Comparing the first four *bytes* avoids slicing the
    // string at a non-boundary when it starts with multi-byte characters.
    matches!(
        s.as_bytes().get(..4),
        Some(prefix) if prefix.eq_ignore_ascii_case(b"www.")
    )
}

/// Conservative e-mail detection.
///
/// `s` is accepted when all of the following hold:
/// - it contains exactly one `@`, with non-empty text on both sides;
/// - the last `.` of the domain part is neither its first nor its last
///   character;
/// - the local part consists only of characters accepted by
///   `is_email_local_char`, and the domain part only of characters accepted
///   by `is_email_domain_char` (ASCII only).
fn looks_like_email(s: &str) -> bool {
    // Exactly two pieces around a single '@'; anything else is rejected.
    let mut pieces = s.split('@');
    let (local, domain) = match (pieces.next(), pieces.next(), pieces.next()) {
        (Some(local), Some(domain), None) => (local, domain),
        _ => return false,
    };
    if local.is_empty() || domain.is_empty() {
        return false;
    }

    // The final dot of the domain must have text on both sides.
    match domain.rfind('.') {
        Some(dot) if dot > 0 && dot + 1 < domain.len() => {}
        _ => return false,
    }

    local.chars().all(is_email_local_char) && domain.chars().all(is_email_domain_char)
}

/// Characters allowed in the local part: ASCII letters and digits plus
/// `.`, `_`, `%`, `+` and `-`.
#[inline]
fn is_email_local_char(c: char) -> bool {
    c.is_ascii_alphanumeric() || "._%+-".contains(c)
}

/// Characters allowed in the domain part: ASCII letters and digits plus
/// `.` and `-`.
#[inline]
fn is_email_domain_char(c: char) -> bool {
    c.is_ascii_alphanumeric() || c == '.' || c == '-'
}