vyre 0.4.0

GPU compute intermediate representation with a standard operation library
Documentation
//! Offset and span classifiers for encoded text, network identities, and secrets.

use super::bytes::{
    find_bytes, is_base64_char, is_boundary, is_email_domain, is_email_local, is_uri_char,
    starts_ci,
};
use super::decode::{parse_ipv4, parse_ipv6, parse_jwt};
use super::{to_u32, validate_input, DetectionError};

/// Public offset-length span returned by detectors.
///
/// Both fields are `u32`; the [`span`] constructor in this module performs the
/// checked conversion from `usize` values. The detectors in this module pass
/// indices into their `input` slice as the offset.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct ByteSpan {
    /// Inclusive byte offset where the match starts.
    pub offset: u32,
    /// Match length in bytes, counted from `offset`.
    pub len: u32,
}

/// Convert a `usize` offset/length pair into a [`ByteSpan`].
///
/// The offset is converted first, then the length; the first failing
/// conversion is the error that is reported.
///
/// # Errors
///
/// Returns `Fix: ...` when either the offset or the length is rejected by
/// `to_u32`, i.e. it cannot fit the public ABI.
pub fn span(offset: usize, len: usize) -> Result<ByteSpan, DetectionError> {
    let offset = to_u32(offset, "offset")?;
    let len = to_u32(len, "length")?;
    Ok(ByteSpan { offset, len })
}

/// Walk forward from `index` over every consecutive byte accepted by `pred`.
///
/// Returns the index of the first byte that `pred` rejects, or `input.len()`
/// when the run reaches the end of `input`. A starting `index` at or beyond
/// `input.len()` is returned unchanged.
#[must_use]
pub fn advance_while(input: &[u8], index: usize, pred: fn(u8) -> bool) -> usize {
    match input.get(index..) {
        // Count the accepted prefix of the tail and offset it by the start.
        Some(tail) => index + tail.iter().take_while(|&&byte| pred(byte)).count(),
        // Start is past the end: there is nothing to scan.
        None => index,
    }
}

/// Walk backward from `index` over every consecutive preceding byte accepted by `pred`.
///
/// Returns the smallest position `p <= index` such that every byte in
/// `input[p..index]` satisfies `pred`. An `index` of `0` is returned unchanged.
///
/// # Panics
///
/// Panics when `index` is greater than `input.len()`.
#[must_use]
pub fn rewind_while(input: &[u8], index: usize, pred: fn(u8) -> bool) -> usize {
    let head = &input[..index];
    // Count the accepted suffix of the head and step back by that many bytes.
    let accepted = head.iter().rev().take_while(|&&byte| pred(byte)).count();
    index - accepted
}

/// Record the start of the run `start..end` when the run is at least `min` bytes long.
///
/// The run length is computed with saturating subtraction, so an `end` that is
/// smaller than `start` counts as a zero-length run. Only the start offset is
/// appended to `offsets`; shorter runs leave `offsets` untouched.
///
/// # Errors
///
/// Returns `Fix: ...` when the start offset cannot fit the public ABI.
pub fn push_run(
    offsets: &mut Vec<u32>,
    start: usize,
    end: usize,
    min: usize,
) -> Result<(), DetectionError> {
    let run_len = end.saturating_sub(start);
    if run_len < min {
        return Ok(());
    }
    let start = to_u32(start, "offset")?;
    offsets.push(start);
    Ok(())
}

/// Find maximal runs of bytes accepted by `predicate` and report them as flattened pairs.
///
/// The returned vector holds two entries per reported run: the index of the
/// run's first byte followed by the index one past its last byte (which is
/// `input.len()` for a run that reaches the end of `input`). Runs shorter than
/// `min_run_len` bytes are dropped.
///
/// # Errors
///
/// Returns `Fix: ...` when `validate_input` rejects the input or when a run
/// boundary cannot fit the public ABI.
pub fn run_offsets(
    input: &[u8],
    min_run_len: u32,
    predicate: fn(&u8) -> bool,
) -> Result<Vec<u32>, DetectionError> {
    validate_input(input)?;
    let threshold = min_run_len as usize;
    let mut pairs = Vec::new();
    let mut cursor = 0usize;
    while cursor < input.len() {
        if !predicate(&input[cursor]) {
            cursor += 1;
            continue;
        }
        // `cursor` sits on the first byte of a run; extend it as far as it goes.
        let run_start = cursor;
        cursor += 1;
        while cursor < input.len() && predicate(&input[cursor]) {
            cursor += 1;
        }
        let run_end = cursor;
        if run_end - run_start >= threshold {
            pairs.push(to_u32(run_start, "run_start")?);
            pairs.push(to_u32(run_end, "run_end")?);
        }
        // The byte at `run_end` (if any) was already rejected, so step over it.
        cursor += 1;
    }
    Ok(pairs)
}

/// Report runs of bytes accepted by `is_base64_char`, in the pair layout of [`run_offsets`].
///
/// Runs shorter than `min_run_len` bytes are not reported.
///
/// # Errors
///
/// Returns `Fix: ...` when input validation or offset conversion fails.
pub fn base64_run_offsets(input: &[u8], min_run_len: u32) -> Result<Vec<u32>, DetectionError> {
    let accepts: fn(&u8) -> bool = is_base64_char;
    run_offsets(input, min_run_len, accepts)
}

/// Report runs of ASCII hexadecimal digits, in the pair layout of [`run_offsets`].
///
/// Runs shorter than `min_run_len` bytes are not reported.
///
/// # Errors
///
/// Returns `Fix: ...` when input validation or offset conversion fails.
pub fn hex_run_offsets(input: &[u8], min_run_len: u32) -> Result<Vec<u32>, DetectionError> {
    let accepts: fn(&u8) -> bool = u8::is_ascii_hexdigit;
    run_offsets(input, min_run_len, accepts)
}

/// Collect spans that look like `http://` or `https://` URLs.
///
/// A candidate must start with one of the two schemes (tested with
/// `starts_ci`), be preceded by a position that `is_boundary` accepts, and be
/// followed by a non-empty run of `is_uri_char` bytes that contains a `.`.
/// The reported span covers the scheme through the end of that run. Scanning
/// resumes after the run, so matches never overlap.
///
/// # Errors
///
/// Returns `Fix: ...` when input validation or span conversion fails.
pub fn url_spans(input: &[u8]) -> Result<Vec<ByteSpan>, DetectionError> {
    validate_input(input)?;
    let mut found = Vec::new();
    let mut cursor = 0usize;
    while cursor < input.len() {
        let plain_http = starts_ci(input, cursor, b"http://");
        if !plain_http && !starts_ci(input, cursor, b"https://") {
            cursor += 1;
            continue;
        }
        let scheme_len = if plain_http { 7 } else { 8 };

        // The byte just before the scheme (or the start of input) must be a boundary.
        let before = cursor.checked_sub(1).and_then(|i| input.get(i).copied());
        if !is_boundary(before) {
            cursor += 1;
            continue;
        }

        let host_start = cursor + scheme_len;
        let run_len = input.get(host_start..).map_or(0, |tail| {
            tail.iter().take_while(|&&byte| is_uri_char(byte)).count()
        });
        let end = host_start + run_len;
        if run_len > 0 && input[host_start..end].iter().any(|&byte| byte == b'.') {
            found.push(span(cursor, end - cursor)?);
        }
        cursor = end.max(cursor + 1);
    }
    Ok(found)
}

/// Collect spans that `parse_ipv4` accepts and that sit between boundaries.
///
/// Every position of `input` is tried as a start. A start is only considered
/// when the preceding byte (or the start of input) passes `is_boundary`, and a
/// parsed address is only reported when the byte after it (or the end of
/// input) passes `is_boundary` as well.
///
/// # Errors
///
/// Returns `Fix: ...` when input validation or span conversion fails.
pub fn ipv4_spans(input: &[u8]) -> Result<Vec<ByteSpan>, DetectionError> {
    validate_input(input)?;
    let mut found = Vec::new();
    // Byte immediately before the current position; `None` at the start of input.
    let mut previous: Option<u8> = None;
    for (start, &byte) in input.iter().enumerate() {
        let before = previous.replace(byte);
        if !is_boundary(before) {
            continue;
        }
        let Some(len) = parse_ipv4(&input[start..]) else {
            continue;
        };
        let after = input.get(start + len).copied();
        if is_boundary(after) {
            found.push(span(start, len)?);
        }
    }
    Ok(found)
}

/// Collect spans that `parse_ipv6` accepts and that sit between boundaries.
///
/// Every position of `input` is tried as a start. A start is only considered
/// when the preceding byte (or the start of input) passes `is_boundary`.
/// Parsed matches shorter than three bytes are discarded, and the byte after
/// the match (or the end of input) must pass `is_boundary` too.
///
/// # Errors
///
/// Returns `Fix: ...` when input validation or span conversion fails.
pub fn ipv6_spans(input: &[u8]) -> Result<Vec<ByteSpan>, DetectionError> {
    validate_input(input)?;
    let mut found = Vec::new();
    // Byte immediately before the current position; `None` at the start of input.
    let mut previous: Option<u8> = None;
    for (start, &byte) in input.iter().enumerate() {
        let before = previous.replace(byte);
        if !is_boundary(before) {
            continue;
        }
        let Some(len) = parse_ipv6(&input[start..]) else {
            continue;
        };
        if len < 3 {
            continue;
        }
        let after = input.get(start + len).copied();
        if is_boundary(after) {
            found.push(span(start, len)?);
        }
    }
    Ok(found)
}

/// Collect spans shaped like `local@domain` email addresses.
///
/// Each `@` that is neither the first nor the last byte of `input` is used as
/// an anchor. The local part is the run of `is_email_local` bytes ending at
/// the `@`, and the domain is the run of `is_email_domain` bytes following it.
/// A match requires a non-empty local part, a domain of at least three bytes
/// that contains a `.`, and `is_boundary` positions on both sides.
///
/// # Errors
///
/// Returns `Fix: ...` when input validation or span conversion fails.
pub fn email_spans(input: &[u8]) -> Result<Vec<ByteSpan>, DetectionError> {
    validate_input(input)?;
    let mut found = Vec::new();
    // Anchors are limited to indices 1..len-1, so the `@` always has a neighbour on each side.
    let upper = input.len().saturating_sub(1);
    let at_signs = input
        .iter()
        .enumerate()
        .take(upper)
        .skip(1)
        .filter(|&(_, &byte)| byte == b'@')
        .map(|(at, _)| at);
    for at in at_signs {
        let local_start = rewind_while(input, at, is_email_local);
        let domain_end = advance_while(input, at + 1, is_email_domain);
        if local_start >= at || domain_end <= at + 3 {
            continue;
        }
        if !input[at + 1..domain_end].iter().any(|&byte| byte == b'.') {
            continue;
        }
        let before = local_start.checked_sub(1).and_then(|i| input.get(i).copied());
        let after = input.get(domain_end).copied();
        if is_boundary(before) && is_boundary(after) {
            found.push(span(local_start, domain_end - local_start)?);
        }
    }
    Ok(found)
}

/// Return true when `bytes` is exactly one canonical textual UUID.
///
/// The canonical form is 36 bytes long: ASCII hexadecimal digits (either case)
/// in groups of 8-4-4-4-12, separated by `-` at byte indices 8, 13, 18 and 23.
/// Inputs of any other length, including the empty slice and truncated
/// prefixes, are rejected.
#[must_use]
pub fn matches_uuid(bytes: &[u8]) -> bool {
    const UUID_LEN: usize = 36;
    // Without the length check a short or empty slice would pass vacuously,
    // and an over-long slice would pass if its extra bytes were hex digits.
    bytes.len() == UUID_LEN
        && bytes.iter().enumerate().all(|(index, byte)| match index {
            8 | 13 | 18 | 23 => *byte == b'-',
            _ => byte.is_ascii_hexdigit(),
        })
}

/// Collect spans of 36-byte windows that `matches_uuid` accepts and that sit between boundaries.
///
/// Every 36-byte window of `input` is tested. A window is reported when it
/// passes `matches_uuid` and both the byte before it (or the start of input)
/// and the byte after it (or the end of input) pass `is_boundary`.
///
/// # Errors
///
/// Returns `Fix: ...` when input validation or span conversion fails.
pub fn uuid_spans(input: &[u8]) -> Result<Vec<ByteSpan>, DetectionError> {
    const UUID_LEN: usize = 36;

    validate_input(input)?;
    let mut found = Vec::new();
    // `windows` yields nothing when the input is shorter than one UUID.
    for (start, candidate) in input.windows(UUID_LEN).enumerate() {
        if !matches_uuid(candidate) {
            continue;
        }
        let before = start.checked_sub(1).and_then(|i| input.get(i).copied());
        let after = input.get(start + UUID_LEN).copied();
        if is_boundary(before) && is_boundary(after) {
            found.push(span(start, UUID_LEN)?);
        }
    }
    Ok(found)
}

/// Collect spans that `parse_jwt` accepts and that sit between boundaries.
///
/// Every position of `input` is tried as a start. A start is only considered
/// when the preceding byte (or the start of input) passes `is_boundary`, and a
/// parsed token is only reported when the byte after it (or the end of input)
/// passes `is_boundary` as well.
///
/// # Errors
///
/// Returns `Fix: ...` when input validation or span conversion fails.
pub fn jwt_spans(input: &[u8]) -> Result<Vec<ByteSpan>, DetectionError> {
    validate_input(input)?;
    let mut found = Vec::new();
    for start in 0..input.len() {
        let before = start.checked_sub(1).and_then(|i| input.get(i).copied());
        if !is_boundary(before) {
            continue;
        }
        match parse_jwt(&input[start..]) {
            Some(len) if is_boundary(input.get(start + len).copied()) => {
                found.push(span(start, len)?);
            }
            _ => {}
        }
    }
    Ok(found)
}

/// Collect spans of PEM armor blocks whose BEGIN and END lines carry the same label.
///
/// The scan looks for `-----BEGIN `, reads the label up to the next `-----`,
/// and requires the label to be non-empty and made only of ASCII uppercase
/// letters and spaces. It then searches for `-----END <label>-----`; when
/// found, the span runs from the first dash of the BEGIN marker through the
/// last dash of the END marker and scanning resumes after it. A BEGIN marker
/// without a closing `-----` for its label ends the scan.
///
/// # Errors
///
/// Returns `Fix: ...` when input validation or span conversion fails.
pub fn pem_spans(input: &[u8]) -> Result<Vec<ByteSpan>, DetectionError> {
    const BEGIN_PREFIX: &[u8] = b"-----BEGIN ";
    const END_PREFIX: &[u8] = b"-----END ";
    const DASHES: &[u8] = b"-----";

    validate_input(input)?;
    let mut blocks = Vec::new();
    let mut cursor = 0usize;
    loop {
        let Some(begin_rel) = find_bytes(&input[cursor..], BEGIN_PREFIX) else {
            break;
        };
        let begin = cursor + begin_rel;
        let label_start = begin + BEGIN_PREFIX.len();
        let Some(label_len) = find_bytes(&input[label_start..], DASHES) else {
            break;
        };
        let label_end = label_start + label_len;
        let label = &input[label_start..label_end];

        let label_ok =
            !label.is_empty() && label.iter().all(|&byte| matches!(byte, b'A'..=b'Z' | b' '));
        if !label_ok {
            // Not a usable label: resume right after this BEGIN prefix.
            cursor = label_start;
            continue;
        }

        let end_marker = [END_PREFIX, label, DASHES].concat();
        cursor = match find_bytes(&input[label_end..], &end_marker) {
            Some(end_rel) => {
                let end = label_end + end_rel + end_marker.len();
                blocks.push(span(begin, end - begin)?);
                end
            }
            // No matching END line: resume at the dashes that closed the label.
            None => label_end,
        };
    }
    Ok(blocks)
}