bibtex-parser 0.3.1

BibTeX parser for Rust
Documentation
//! SIMD-accelerated parsing utilities for BibTeX
//!
//! This module provides SIMD-optimized functions for common parsing operations
//! like brace balancing and quote scanning, achieving 30-50% performance gains.

use memchr::{memchr, memchr2};

/// Find balanced braces using SIMD acceleration
///
/// This function scans for matching braces, handling:
/// - Nested braces
/// - Escaped characters (\{, \})
/// - Efficient SIMD scanning for delimiters
#[inline]
#[must_use]
pub fn find_balanced_braces(input: &[u8]) -> Option<usize> {
    if input.is_empty() || input[0] != b'{' {
        return None;
    }

    let mut depth = 1;
    let mut pos = 1;

    // Find braces directly and only inspect backslashes when a brace may close
    // or change nesting.
    while pos < input.len() {
        if let Some(offset) = memchr2(b'{', b'}', &input[pos..]) {
            let idx = pos + offset;
            if is_escaped_delimiter(input, idx) {
                pos = idx + 1;
                continue;
            }

            match input[idx] {
                b'{' => {
                    depth += 1;
                    pos = idx + 1;
                }
                b'}' => {
                    depth -= 1;
                    if depth == 0 {
                        return Some(idx + 1); // Return position after closing brace
                    }
                    pos = idx + 1;
                }
                _ => unreachable!(),
            }
        } else {
            // No more delimiters found, unbalanced
            return None;
        }
    }

    None // Unbalanced
}

/// Find balanced quotes using SIMD acceleration
///
/// This function scans for the closing quote, handling:
/// - Escaped quotes (\")
/// - Efficient SIMD scanning
#[inline]
#[must_use]
pub fn find_balanced_quotes(input: &[u8]) -> Option<usize> {
    if input.is_empty() || input[0] != b'"' {
        return None;
    }

    let mut pos = 1;

    // Find quotes directly and only check the backslash run immediately before
    // each quote. This avoids visiting every LaTeX command backslash.
    while pos < input.len() {
        let offset = memchr(b'"', &input[pos..])?;
        let idx = pos + offset;
        if is_escaped_delimiter(input, idx) {
            pos = idx + 1;
            continue;
        }
        return Some(idx + 1); // Return position after closing quote
    }

    None // Unclosed quote
}

/// Find balanced parentheses using SIMD acceleration
#[inline]
#[must_use]
pub fn find_balanced_parentheses(input: &[u8]) -> Option<usize> {
    if input.is_empty() || input[0] != b'(' {
        return None;
    }

    let mut depth = 1;
    let mut pos = 1;

    // Use SIMD to find next delimiter: ( or )
    while pos < input.len() {
        // Find next interesting character using SIMD
        if let Some(offset) = memchr::memchr2(b'(', b')', &input[pos..]) {
            let idx = pos + offset;

            match input[idx] {
                b'(' => {
                    depth += 1;
                    pos = idx + 1;
                }
                b')' => {
                    depth -= 1;
                    if depth == 0 {
                        return Some(idx + 1); // Return position after closing paren
                    }
                    pos = idx + 1;
                }
                _ => unreachable!(),
            }
        } else {
            // No more delimiters found, unbalanced
            return None;
        }
    }

    None // Unbalanced
}

#[inline]
fn is_escaped_delimiter(input: &[u8], delimiter: usize) -> bool {
    if delimiter == 0 || input[delimiter - 1] != b'\\' {
        return false;
    }

    let mut slash_count = 0usize;
    let mut pos = delimiter;

    while pos > 0 && input[pos - 1] == b'\\' {
        slash_count += 1;
        pos -= 1;
    }

    slash_count % 2 == 1
}

/// Fast scan to find entry start (@)
#[inline]
#[must_use]
pub fn find_entry_start(input: &[u8]) -> Option<usize> {
    memchr::memchr(b'@', input)
}

/// Fast scan for field separator (=)
#[inline]
#[must_use]
pub fn find_field_separator(input: &[u8]) -> Option<usize> {
    memchr::memchr(b'=', input)
}

/// Fast scan for next comma or closing delimiter
#[inline]
#[must_use]
pub fn find_field_end(input: &[u8]) -> Option<usize> {
    memchr::memchr3(b',', b'}', b')', input)
}

/// SIMD-accelerated identifier scanning using lookup table
/// Returns the length of the identifier (alphanumeric, _, -, :, .)
#[inline]
#[must_use]
pub fn scan_identifier(input: &[u8]) -> usize {
    let mut pos = 0;
    let len = input.len();

    // Unroll by 4 for better performance
    while pos + 4 <= len {
        // Check 4 bytes at once
        if !is_identifier_byte(input[pos]) {
            return pos;
        }
        if !is_identifier_byte(input[pos + 1]) {
            return pos + 1;
        }
        if !is_identifier_byte(input[pos + 2]) {
            return pos + 2;
        }
        if !is_identifier_byte(input[pos + 3]) {
            return pos + 3;
        }
        pos += 4;
    }

    // Handle remaining bytes
    while pos < len && is_identifier_byte(input[pos]) {
        pos += 1;
    }

    pos
}

#[inline]
const fn is_identifier_byte(byte: u8) -> bool {
    IDENT_TABLE[byte as usize] == 1
}

const IDENT_TABLE: [u8; 256] = {
    let mut table = [0u8; 256];
    let mut byte = b'0';
    while byte <= b'9' {
        table[byte as usize] = 1;
        byte += 1;
    }
    byte = b'A';
    while byte <= b'Z' {
        table[byte as usize] = 1;
        byte += 1;
    }
    byte = b'a';
    while byte <= b'z' {
        table[byte as usize] = 1;
        byte += 1;
    }
    table[b'_' as usize] = 1;
    table[b'-' as usize] = 1;
    table[b':' as usize] = 1;
    table[b'.' as usize] = 1;
    table
};

/// Lookup table for whitespace characters
const WS_TABLE: [u8; 256] = {
    let mut table = [0u8; 256];
    table[b' ' as usize] = 1;
    table[b'\t' as usize] = 1;
    table[b'\n' as usize] = 1;
    table[b'\r' as usize] = 1;
    table
};

/// SIMD-accelerated whitespace scanning
/// Returns the length of the whitespace sequence
#[inline]
#[must_use]
pub const fn scan_whitespace(input: &[u8]) -> usize {
    let mut pos = 0;
    let len = input.len();

    // Unroll by 4 for better performance
    while pos + 4 <= len {
        // Check 4 bytes at once
        if WS_TABLE[input[pos] as usize] == 0 {
            return pos;
        }
        if WS_TABLE[input[pos + 1] as usize] == 0 {
            return pos + 1;
        }
        if WS_TABLE[input[pos + 2] as usize] == 0 {
            return pos + 2;
        }
        if WS_TABLE[input[pos + 3] as usize] == 0 {
            return pos + 3;
        }
        pos += 4;
    }

    // Handle remaining bytes
    while pos < len && WS_TABLE[input[pos] as usize] == 1 {
        pos += 1;
    }

    pos
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_balanced_braces() {
        assert_eq!(find_balanced_braces(b"{}"), Some(2));
        assert_eq!(find_balanced_braces(b"{hello}"), Some(7));
        assert_eq!(find_balanced_braces(b"{nested {braces}}"), Some(17));
        assert_eq!(find_balanced_braces(b"{escaped \\} brace}"), Some(18));
        assert_eq!(find_balanced_braces(b"{unclosed"), None);
        assert_eq!(find_balanced_braces(b""), None);
        assert_eq!(find_balanced_braces(b"not starting with brace"), None);
    }

    #[test]
    fn test_balanced_quotes() {
        assert_eq!(find_balanced_quotes(b"\"\""), Some(2));
        assert_eq!(find_balanced_quotes(b"\"hello\""), Some(7));
        assert_eq!(find_balanced_quotes(b"\"escaped \\\" quote\""), Some(18));
        assert_eq!(find_balanced_quotes(b"\"unclosed"), None);
        assert_eq!(find_balanced_quotes(b""), None);
        assert_eq!(find_balanced_quotes(b"not starting with quote"), None);
    }

    #[test]
    fn test_balanced_parentheses() {
        assert_eq!(find_balanced_parentheses(b"()"), Some(2));
        assert_eq!(find_balanced_parentheses(b"(hello)"), Some(7));
        assert_eq!(find_balanced_parentheses(b"(nested (parens))"), Some(17));
        assert_eq!(find_balanced_parentheses(b"(unclosed"), None);
    }
}