// iriq 0.30.2
//
// IRI/URL extraction, normalization, and shape clustering.
// Documentation
use crate::identifier::Identifier;
use crate::parser::parse;
use once_cell::sync::Lazy;
use regex::Regex;
use std::collections::HashSet;

/// Finds IRI/URL candidates in free text and parses them into [`Identifier`]s.
pub struct Extractor {
    /// Selects the pattern used by `extract`.
    ///
    /// When `true`, the combined pattern is used, which additionally matches
    /// scheme-less `host.tld/path` candidates, and any match directly preceded
    /// by `.` or `@` is rejected. When `false`, only scheme-prefixed and
    /// `urn:` candidates are matched.
    pub scheme_less: bool,
}

impl Extractor {
    /// Creates an extractor with scheme-less matching switched on.
    pub fn new() -> Self {
        Self { scheme_less: true }
    }

    /// Returns every candidate in `text` that survives boundary checking,
    /// trailing-punctuation trimming, and parsing, in order of appearance.
    ///
    /// Candidates that fail to parse are dropped silently. Duplicates are
    /// kept; use [`Extractor::extract_strings`] for a de-duplicated list.
    pub fn extract(&self, text: &str) -> Vec<Identifier> {
        if text.is_empty() {
            return Vec::new();
        }

        let allow_schemeless = self.scheme_less;
        let pattern: &Regex = if allow_schemeless {
            &COMBINED_RE
        } else {
            &CANDIDATE_RE
        };

        pattern
            .find_iter(text)
            // Reject matches glued to a preceding word, path, host or mailbox.
            .filter(|found| left_boundary_ok(text, found.start(), allow_schemeless))
            // Strip sentence punctuation and unbalanced closing brackets.
            .map(|found| trim_candidate(found.as_str()))
            .filter(|trimmed| !trimmed.is_empty())
            .filter_map(|trimmed| parse(&trimmed).ok())
            .collect()
    }

    /// Returns the canonical string of each extracted identifier, keeping
    /// only the first occurrence of each canonical form.
    pub fn extract_strings(&self, text: &str) -> Vec<String> {
        let mut seen = HashSet::new();
        self.extract(text)
            .into_iter()
            .map(|iri| iri.canonical())
            // `insert` returns false for a value already present.
            .filter(|canonical| seen.insert(canonical.clone()))
            .collect()
    }
}

impl Default for Extractor {
    fn default() -> Self {
        Self::new()
    }
}

/// URL schemes recognised before `://`. Joined with `|` into a
/// case-insensitive alternation in `CANDIDATE_RE` and `COMBINED_RE`.
const SCHEMES: &[&str] = &["https", "http", "ftp", "wss", "ws"];
/// Top-level domains accepted for scheme-less candidates. Joined with `|`
/// into a case-insensitive alternation in `COMBINED_RE`.
const SCHEMELESS_TLDS: &[&str] = &[
    "com", "org", "net", "io", "ai", "dev", "co", "app", "gov", "edu",
];

/// Bracket and quotation characters that end a URL candidate; each one is
/// added to the negated character class built by [`url_chars_class`].
///
/// NOTE(review): the name says non-ASCII, but the literal as written also
/// contains ASCII `)`, `}`, `]`, `>`, `(`, `{`, `[`, `<`, so those are excluded
/// from URL characters too. Confirm whether full-width forms were intended.
const NON_ASCII_BOUNDARY: &str =
    "」』)】〉》〕〗〙〛⦆}]>「『(【〈《〔〖〘〚⦅{[<“”‘’„‟‚«»‹›";

/// Builds the regex fragment that matches one or more URL characters.
///
/// The fragment is a negated class excluding whitespace, `<`, `>`, `"`, `'`,
/// a backtick, `,`, and every distinct character of [`NON_ASCII_BOUNDARY`].
///
/// Characters that are syntax inside a bracketed class are backslash-escaped.
/// That includes `[`: the `regex` crate supports nested classes, so a bare
/// `[` would open one and leave the outer class unterminated, making the
/// pattern fail to compile.
fn url_chars_class() -> String {
    let mut escaped = String::with_capacity(NON_ASCII_BOUNDARY.len() + 8);
    let mut seen: HashSet<char> = HashSet::new();
    for c in NON_ASCII_BOUNDARY.chars() {
        // Emit each boundary character once.
        if !seen.insert(c) {
            continue;
        }
        if matches!(c, '[' | ']' | '\\' | '^' | '-') {
            escaped.push('\\');
        }
        escaped.push(c);
    }
    format!(r#"[^\s<>"'`,{}]+"#, escaped)
}

/// Pattern used when scheme-less matching is off. It matches either
/// `<scheme>://<url chars>` for a scheme in `SCHEMES` (case-insensitive), or
/// `urn:<nid>:<url chars>` where `<nid>` is an ASCII alphanumeric followed by
/// up to 30 ASCII alphanumerics or hyphens.
///
/// Compiled on first use; panics at that point if the pattern is invalid.
static CANDIDATE_RE: Lazy<Regex> = Lazy::new(|| {
    let url_chars = url_chars_class();
    // `{{0,30}}` is brace-escaped for `format!`; the regex sees `{0,30}`.
    let pat = format!(
        r"(?:(?i:{schemes})://{u}|urn:[a-zA-Z0-9][a-zA-Z0-9\-]{{0,30}}:{u})",
        schemes = SCHEMES.join("|"),
        u = url_chars,
    );
    Regex::new(&pat).unwrap()
});

/// Pattern used when scheme-less matching is on. It has the same two
/// alternatives as `CANDIDATE_RE` plus a third: one or more dot-terminated
/// host labels (1 to 63 ASCII alphanumerics/hyphens, not starting or ending
/// with a hyphen), a TLD from `SCHEMELESS_TLDS` (case-insensitive), a `/`,
/// and at least one URL character. A bare `host.tld` with no path does not match.
///
/// Compiled on first use; panics at that point if the pattern is invalid.
static COMBINED_RE: Lazy<Regex> = Lazy::new(|| {
    let url_chars = url_chars_class();
    // Doubled braces are `format!` escapes; the regex sees `{0,30}` / `{0,61}`.
    let pat = format!(
        r"(?:(?i:{schemes})://{u}|urn:[a-zA-Z0-9][a-zA-Z0-9\-]{{0,30}}:{u}|(?:[a-zA-Z0-9](?:[a-zA-Z0-9\-]{{0,61}}[a-zA-Z0-9])?\.)+(?i:{tlds})/{u})",
        schemes = SCHEMES.join("|"),
        tlds = SCHEMELESS_TLDS.join("|"),
        u = url_chars,
    );
    Regex::new(&pat).unwrap()
});

/// Matches a run of punctuation at the very end of the input: `.` `,` `;` `:`
/// `!` `?` `'` `"` and the curly quotes U+2018, U+2019, U+201C, U+201D.
static TRAILING_PUNCT_RE: Lazy<Regex> =
    Lazy::new(|| Regex::new(r#"[.,;:!?'"\u{2018}\u{2019}\u{201C}\u{201D}]+$"#).unwrap());

/// Returns the opening bracket that matches the closing bracket `c`, or
/// `None` when `c` is not one of `)`, `]`, `}`.
fn bracket_pair(c: char) -> Option<char> {
    // (closing, opening) pairs.
    const PAIRS: [(char, char); 3] = [(')', '('), (']', '['), ('}', '{')];
    PAIRS
        .iter()
        .find(|&&(closing, _)| closing == c)
        .map(|&(_, opening)| opening)
}

/// Decides whether a match beginning at byte offset `start` of `text` starts
/// at an acceptable boundary.
///
/// A match at the very start of `text` is always accepted. Otherwise the
/// character immediately before it must not be a word character (see
/// `is_word`) or `/`, and when `schemeless` is set it must not be `.` or `@`.
fn left_boundary_ok(text: &str, start: usize, schemeless: bool) -> bool {
    match text[..start].chars().next_back() {
        // Nothing precedes the match.
        None => true,
        Some(prev) => {
            let glued_to_text = is_word(prev) || prev == '/';
            let inside_host_or_mailbox = schemeless && matches!(prev, '.' | '@');
            !(glued_to_text || inside_host_or_mailbox)
        }
    }
}

/// Reports whether `c` counts as a word character: `_`, any alphabetic or
/// numeric character, or a code point in U+0300..=U+036F.
fn is_word(c: char) -> bool {
    match c {
        '_' => true,
        '\u{0300}'..='\u{036F}' => true,
        other => other.is_alphabetic() || other.is_numeric(),
    }
}

/// Strips trailing sentence punctuation and unbalanced closing brackets from
/// a raw match.
///
/// Each pass first removes the trailing run matched by `TRAILING_PUNCT_RE`.
/// Then, for `)`, `]` and `}` in turn, it removes a trailing closer for as
/// long as that closer outnumbers its opener in what remains. Passes repeat
/// until one removes nothing. The result may be empty.
fn trim_candidate(candidate: &str) -> String {
    // Only the tail is ever removed, so a shrinking borrowed slice suffices.
    let mut rest = candidate;
    loop {
        let len_before = rest.len();

        // The pattern is anchored with `$`, so a match always runs to the end.
        if let Some(punct) = TRAILING_PUNCT_RE.find(rest) {
            rest = &rest[..punct.start()];
        }

        for close in [')', ']', '}'] {
            let open = bracket_pair(close).unwrap();
            while rest.ends_with(close) {
                let closers = rest.chars().filter(|&c| c == close).count();
                let openers = rest.chars().filter(|&c| c == open).count();
                if closers <= openers {
                    break;
                }
                rest = &rest[..rest.len() - close.len_utf8()];
            }
        }

        // Nothing removed this pass: the candidate is stable.
        if rest.len() == len_before {
            return rest.to_string();
        }
    }
}