auto_encoder 0.2.0

Auto encoding library
Documentation
use crate::meta::{HtmlMetadata, ASSET_NUMBERS, FIRST_BYTE_MAP};

/// Reports whether `content` starts with the signature of a known binary format.
///
/// The first byte of `content` selects a list of candidate keys from `FIRST_BYTE_MAP`;
/// each key is looked up in `ASSET_NUMBERS` and the stored byte signature is compared
/// against the beginning of `content`. Empty input is never reported as binary.
#[inline]
pub fn is_binary_file(content: &[u8]) -> bool {
    let Some(lead) = content.first() else {
        return false;
    };

    FIRST_BYTE_MAP.get(lead).is_some_and(|&candidates| {
        candidates.iter().any(|&candidate| {
            // Keys without a registered signature are simply skipped.
            ASSET_NUMBERS
                .get(candidate)
                .is_some_and(|&signature| content.starts_with(signature))
        })
    })
}

/// Locates a single byte in `haystack`.
///
/// Thin wrapper over `memchr::memchr`, which reports the index of the first matching
/// byte or `None` when there is none. Note that `memchr` takes the needle first; this
/// wrapper flips the order to haystack-first.
#[inline(always)]
fn find_byte(haystack: &[u8], needle: u8) -> Option<usize> {
    memchr::memchr(needle, haystack)
}

/// Reads an attribute value that starts at the first byte of `after_attr`.
///
/// * If the first byte is `"` or `'`, the value is everything up to the next matching
///   quote; a missing closing quote yields `None`.
/// * Otherwise the value runs up to the first ASCII whitespace byte or `>`; if neither
///   terminator is present the result is `None`.
///
/// Also returns `None` for empty input or when the value is not valid UTF-8.
#[inline(always)]
fn extract_quoted_or_unquoted(after_attr: &[u8]) -> Option<String> {
    let (&lead, tail) = after_attr.split_first()?;

    let value = match lead {
        b'"' | b'\'' => {
            let closing = find_byte(tail, lead)?;
            &tail[..closing]
        }
        _ => {
            let stop = after_attr
                .iter()
                .position(|&byte| byte == b'>' || byte.is_ascii_whitespace())?;
            &after_attr[..stop]
        }
    };

    std::str::from_utf8(value).ok().map(|text| text.to_owned())
}

/// Only this many leading bytes of a document are inspected for metadata.
const META_SEARCH_LIMIT: usize = 1024;

/// Returns `bytes` with any leading ASCII whitespace removed.
fn skip_ascii_whitespace(bytes: &[u8]) -> &[u8] {
    let start = bytes
        .iter()
        .position(|b| !b.is_ascii_whitespace())
        .unwrap_or(bytes.len());
    &bytes[start..]
}

/// Scans the attributes of one `<meta` tag and returns the charset it declares, if any.
///
/// `after_meta` must start immediately after the literal `<meta`. Attribute names are
/// compared ASCII case-insensitively and quoted values are skipped as a unit, so text
/// such as `content="how to set charset=foo"` is never mistaken for a `charset`
/// attribute. Two declarations are recognised:
///
/// * `charset=VALUE` (quoted or unquoted) returns `VALUE` as soon as it is seen;
/// * `http-equiv=content-type` together with `content="...; charset=VALUE"`, in either
///   attribute order, returns `VALUE` cut at the first `;` or whitespace.
///
/// Scanning stops at the tag's closing `>`, or early if a value is cut off by the end
/// of the input (the tag was truncated).
fn charset_from_meta_attrs(after_meta: &[u8]) -> Option<String> {
    // `<metadata ...>` and similar merely share the `<meta` prefix.
    match after_meta.first() {
        Some(&b) if b.is_ascii_whitespace() || b == b'/' || b == b'>' => {}
        _ => return None,
    }

    let mut attrs = after_meta;
    let mut is_content_type = false;
    let mut content: Option<&[u8]> = None;

    loop {
        // Skip separators in front of the next attribute name.
        let skip = attrs
            .iter()
            .position(|&b| !(b.is_ascii_whitespace() || b == b'/'))
            .unwrap_or(attrs.len());
        attrs = &attrs[skip..];
        if matches!(attrs.first(), None | Some(b'>')) {
            break;
        }

        let name_len = attrs
            .iter()
            .position(|&b| b == b'=' || b == b'>' || b == b'/' || b.is_ascii_whitespace())
            .unwrap_or(attrs.len());
        let (name, after_name) = attrs.split_at(name_len);

        let value = match skip_ascii_whitespace(after_name).split_first() {
            Some((b'=', after_eq)) => {
                let raw = skip_ascii_whitespace(after_eq);
                match raw.split_first() {
                    Some((&quote, body)) if quote == b'"' || quote == b'\'' => {
                        // No closing quote: the tag was cut off, nothing more to read.
                        let Some(end) = find_byte(body, quote) else {
                            break;
                        };
                        attrs = &body[end + 1..];
                        &body[..end]
                    }
                    _ => {
                        // An unquoted value must be terminated, otherwise it may be
                        // a truncated fragment of the real value.
                        let Some(end) = raw
                            .iter()
                            .position(|&b| b.is_ascii_whitespace() || b == b'>')
                        else {
                            break;
                        };
                        attrs = &raw[end..];
                        &raw[..end]
                    }
                }
            }
            // Attribute without a value (e.g. `itemscope`): nothing to inspect.
            _ => {
                attrs = after_name;
                continue;
            }
        };

        if name.eq_ignore_ascii_case(b"charset") {
            if let Ok(charset) = std::str::from_utf8(value) {
                return Some(charset.to_string());
            }
        } else if name.eq_ignore_ascii_case(b"http-equiv") {
            is_content_type = value.eq_ignore_ascii_case(b"content-type");
        } else if name.eq_ignore_ascii_case(b"content") {
            content = Some(value);
        }
    }

    // <meta http-equiv="Content-Type" content="...; charset=...">
    if !is_content_type {
        return None;
    }
    let content = content?;
    let label_start = content
        .windows(8)
        .position(|window| window.eq_ignore_ascii_case(b"charset="))?
        + 8;
    let label = &content[label_start..];
    let label_end = label
        .iter()
        .position(|&c| c == b';' || c.is_ascii_whitespace())
        .unwrap_or(label.len());
    std::str::from_utf8(&label[..label_end])
        .ok()
        .map(String::from)
}

/// Detect the language of a HTML resource.
///
/// Looks at the first 1024 bytes only. After the first `<html`, the first `lang=`
/// attribute is read (quoted or unquoted value). Occurrences of `lang=` that are merely
/// the tail of a longer attribute name, such as `hreflang=` or `data-lang=`, are
/// skipped; `xml:lang=` is still accepted.
///
/// Returns `None` when there is no `<html`, no `lang=` attribute, or the value cannot
/// be extracted.
///
/// NOTE(review): the previous doc comment said this "does nothing without the
/// `encoding` flag enabled"; no feature gate is visible in this file — confirm.
#[inline]
pub fn detect_language(html_content: &[u8]) -> Option<String> {
    let search_area = &html_content[..html_content.len().min(META_SEARCH_LIMIT)];
    let html_start = find_subsequence(search_area, b"<html")?;
    let rest = &search_area[html_start..];

    let mut offset = 0;
    while let Some(found) = find_subsequence(&rest[offset..], b"lang=") {
        let attr_start = offset + found;
        let value_start = attr_start + 5;
        // `rest` begins with `<`, so a match can never sit at index 0.
        let before = rest[attr_start - 1];
        let inside_longer_name =
            before.is_ascii_alphanumeric() || before == b'-' || before == b'_';
        if !inside_longer_name {
            return extract_quoted_or_unquoted(&rest[value_start..]);
        }
        offset = value_start;
    }

    None
}

/// Detect the encoding used in an HTML file.
///
/// Looks at the first 1024 bytes only and inspects every `<meta` tag in order,
/// returning the first charset declared either as `<meta charset=...>` or as
/// `<meta http-equiv="Content-Type" content="...; charset=...">`. Attribute names and
/// the `http-equiv` value are matched case-insensitively, values may be double-quoted,
/// single-quoted or unquoted, and attributes may appear in any order. Only attributes
/// of the `<meta` tag itself are considered.
///
/// Returns `None` when no `<meta` tag in the inspected prefix declares a charset.
#[inline]
pub fn detect_encoding(html_content: &[u8]) -> Option<String> {
    let search_area = &html_content[..html_content.len().min(META_SEARCH_LIMIT)];
    let mut pos = 0;

    while let Some(meta_start) = find_subsequence(&search_area[pos..], b"<meta") {
        // Continue after this `<meta` on the next round.
        pos += meta_start + 5;
        if let Some(charset) = charset_from_meta_attrs(&search_area[pos..]) {
            return Some(charset);
        }
    }

    None
}

/// Detect the html metadata to process the element based on the encoding or language found.
///
/// Combines [`detect_language`] and [`detect_encoding`] so the three entry points always
/// agree. The result is always `Some`; fields that could not be detected (including for
/// empty input) are `None`.
#[inline]
pub fn detect_html_metadata(html_content: &[u8]) -> Option<HtmlMetadata> {
    Some(HtmlMetadata {
        lang: detect_language(html_content),
        encoding: detect_encoding(html_content),
    })
}

/// Helper function to find a subsequence in a slice. Uses SIMD-accelerated search.
///
/// Delegates to `memchr::memmem::find`, which reports the byte offset of the first
/// occurrence of `needle` in `haystack`, or `None` when `needle` does not occur.
#[inline(always)]
pub fn find_subsequence(haystack: &[u8], needle: &[u8]) -> Option<usize> {
    memchr::memmem::find(haystack, needle)
}