#![deny(missing_docs)]
#[cfg(feature = "database")]
use aho_corasick::AhoCorasick;
#[cfg(feature = "database")]
use regex::{Regex, RegexSet};
#[cfg(feature = "database")]
use serde::Deserialize;
#[cfg(feature = "database")]
use std::sync::LazyLock;
#[cfg(feature = "database")]
#[derive(Debug, Deserialize)]
/// Metadata for one crawler entry deserialized from the bundled JSON database.
pub struct CrawlerInfo {
    /// Pattern identifying the crawler's user agent. Interpreted as a regex,
    /// or matched as a plain substring when it contains no regex syntax.
    pub pattern: String,
    /// Optional URL for the crawler — presumably a link to its documentation;
    /// confirm against the database schema.
    #[serde(default)]
    pub url: Option<String>,
    /// Human-readable description of the crawler.
    pub description: String,
    /// Free-form classification tags.
    pub tags: Vec<String>,
    /// Reverse-DNS names associated with the crawler (not consulted by the
    /// matchers in this file).
    #[serde(default)]
    pub rdns: Vec<String>,
}
#[cfg(feature = "database")]
// Raw JSON crawler database embedded into the binary at compile time.
const CRAWLER_DATABASE: &str = include_str!("../crawlers.min.json");
#[cfg(feature = "database")]
// Maximum number of patterns compiled into each `RegexSet` chunk.
const DATABASE_CHUNK_SIZE: usize = 128;
#[cfg(feature = "database")]
// Crawler entries parsed once from the bundled JSON on first access.
static CRAWLERS: LazyLock<Vec<CrawlerInfo>> = LazyLock::new(|| {
    serde_json::from_str(CRAWLER_DATABASE).expect("bundled crawler database is valid")
});
#[cfg(feature = "database")]
// Matchers compiled lazily from `CRAWLERS` on first use: patterns that are
// pure literals go into a single Aho-Corasick automaton (fast multi-substring
// search); everything else is compiled into chunked `RegexSet`s.
static DATABASE_MATCHERS: LazyLock<DatabaseMatchers> = LazyLock::new(|| {
    let mut literal_patterns = Vec::new();
    let mut literal_indices = Vec::new();
    let mut regex_patterns = Vec::new();
    for (index, crawler) in CRAWLERS.iter().enumerate() {
        // Patterns without regex syntax become plain substring needles.
        if let Some(literal) = regex_literal(&crawler.pattern) {
            literal_indices.push(index);
            literal_patterns.push(literal);
            continue;
        }
        // Skip entries whose pattern fails to compile instead of panicking,
        // so one bad database entry cannot disable all matching.
        if Regex::new(&crawler.pattern).is_ok() {
            regex_patterns.push((index, crawler.pattern.as_str()));
        }
    }
    DatabaseMatchers {
        literals: AhoCorasick::new(&literal_patterns)
            .expect("bundled crawler literals are valid"),
        literal_indices,
        // BUG FIX: was `regex_chunks(®ex_patterns)` — mojibake for
        // `&regex_patterns` (`&r` mangled into `®`), which does not compile.
        regexes: regex_chunks(&regex_patterns),
    }
});
#[cfg(feature = "database")]
/// Precompiled matchers built from the crawler database.
struct DatabaseMatchers {
    // Aho-Corasick automaton over all purely-literal patterns.
    literals: AhoCorasick,
    // Maps automaton pattern order back to indices into `CRAWLERS`.
    literal_indices: Vec<usize>,
    // Non-literal patterns, compiled in chunks of `DATABASE_CHUNK_SIZE`.
    regexes: Vec<RegexChunk>,
}
#[cfg(feature = "database")]
/// One `RegexSet` covering a slice of the database, plus the original indices.
struct RegexChunk {
    patterns: RegexSet,
    // `indices[i]` is the `CRAWLERS` index for set pattern `i`.
    indices: Vec<usize>,
}
#[cfg(feature = "database")]
// Attempts to read `pattern` as a plain literal: returns the unescaped text
// when the pattern contains no regex metacharacters, or `None` when it is
// empty or requires a real regex engine.
fn regex_literal(pattern: &str) -> Option<String> {
    let mut text = String::with_capacity(pattern.len());
    let mut stream = pattern.chars();
    while let Some(c) = stream.next() {
        let resolved = if c == '\\' {
            // Backslash escapes are accepted only for a small punctuation
            // allowlist; anything else (or a trailing `\`) disqualifies
            // the literal path.
            regex_literal_escape(stream.next()?)?
        } else if regex_meta(c) {
            return None;
        } else {
            c
        };
        text.push(resolved);
    }
    if text.is_empty() {
        None
    } else {
        Some(text)
    }
}
#[cfg(feature = "database")]
// Resolves a backslash escape to its literal character. Only a small set of
// punctuation escapes is accepted; every other escape yields `None`.
fn regex_literal_escape(c: char) -> Option<char> {
    const ALLOWED: &str = "/.-_ :)(!";
    if ALLOWED.contains(c) {
        Some(c)
    } else {
        None
    }
}
#[cfg(feature = "database")]
// Reports whether `c` is a regex metacharacter, i.e. whether its unescaped
// presence prevents treating the whole pattern as a plain literal.
fn regex_meta(c: char) -> bool {
    ".^$*+?{}[]|()".contains(c)
}
#[cfg(feature = "database")]
// Compiles `(index, pattern)` pairs into `RegexSet`s of at most
// `DATABASE_CHUNK_SIZE` patterns each, remembering which `CRAWLERS` index
// every set slot refers to.
fn regex_chunks(patterns: &[(usize, &str)]) -> Vec<RegexChunk> {
    let mut chunks = Vec::with_capacity(patterns.len().div_ceil(DATABASE_CHUNK_SIZE));
    for entries in patterns.chunks(DATABASE_CHUNK_SIZE) {
        let indices = entries.iter().map(|&(index, _)| index).collect();
        let set = RegexSet::new(entries.iter().map(|&(_, pattern)| pattern))
            .expect("validated crawler regexes are valid");
        chunks.push(RegexChunk {
            patterns: set,
            indices,
        });
    }
    chunks
}
// Lowercase substrings whose presence marks a user agent as a crawler.
// Each needle is stored as (first byte, remaining bytes) so `contains_any`
// can gate on the first-byte lookup table; an empty remainder (e.g.
// `(b'@', b"")`) matches that single byte on its own.
const CRAWLER_KEYWORDS: &[(u8, &[u8])] = &[
    (b'h', b"ttp://"),
    (b'h', b"ttps://"),
    (b'+', b"http"),
    (b'@', b""),
    (b'b', b"ot"),
    (b'c', b"rawl"),
    (b'c', b"hecker"),
    (b's', b"pider"),
    (b's', b"canner"),
    (b's', b"crape"),
    (b'f', b"eed"),
    (b'f', b"etch"),
    (b'm', b"onitor"),
    (b'p', b"tst"),
    (b'p', b"review"),
    (b'a', b"rchive"),
];
// Lowercase browser engine/brand substrings, in the same (first byte, rest)
// layout as `CRAWLER_KEYWORDS`. Their presence argues against the crawler
// classification in `classify`.
const BROWSER_ENGINES: &[(u8, &[u8])] = &[
    (b'g', b"ecko"),
    (b'k', b"html"),
    (b'k', b"onqueror"),
    (b'w', b"ebkit"),
    (b'c', b"hrome"),
    (b'f', b"irefox"),
    (b'm', b"sie"),
    (b'e', b"dge"),
    (b'o', b"pera"),
    (b't', b"rident"),
    (b'p', b"resto"),
    (b'l', b"inks"),
    (b'i', b"cab"),
];
// Compile-time lookup tables marking which bytes can begin a needle; they let
// `contains_any` reject most haystack positions with a single array load.
const KEYWORD_FIRST_BYTES: [bool; 256] = first_byte_table(CRAWLER_KEYWORDS);
const ENGINE_FIRST_BYTES: [bool; 256] = first_byte_table(BROWSER_ENGINES);
// Builds a 256-entry table marking every byte that starts at least one needle.
// This is a `const fn` so the tables above are computed at compile time;
// a `while` loop is used because `for`/iterators are not allowed in const
// contexts.
const fn first_byte_table(needles: &[(u8, &[u8])]) -> [bool; 256] {
    let mut table = [false; 256];
    let mut remaining = needles.len();
    while remaining > 0 {
        remaining -= 1;
        table[needles[remaining].0 as usize] = true;
    }
    table
}
/// Heuristically classifies `user_agent` as a crawler/bot.
///
/// Empty strings classify as crawlers; agents longer than 512 bytes never do
/// (see `classify`). Verdicts are memoized in a small per-thread cache keyed
/// on the slice's address, length, and first/last eight bytes.
pub fn is_crawler(user_agent: &str) -> bool {
    let bytes = user_agent.as_bytes();
    // Cheap fingerprint: pointer-derived hash plus the slice's edge words.
    let key = cache_key(bytes);
    let (head, tail) = edge_words(bytes);
    CACHE.with(|cache| {
        // Direct-mapped cache: the key selects exactly one slot
        // (CACHE_SLOTS is a power of two, so the mask is valid).
        let slot = &mut cache.borrow_mut()[(key as usize) & (CACHE_SLOTS - 1)];
        // NOTE(review): two distinct strings sharing address, length, head
        // and tail words would collide here and return a stale verdict —
        // presumably an accepted trade-off for this probabilistic cache;
        // confirm.
        if slot.key == key && slot.len == bytes.len() && slot.head == head && slot.tail == tail {
            return slot.result;
        }
        let result = classify(bytes);
        *slot = Entry {
            key,
            len: bytes.len(),
            head,
            tail,
            result,
        };
        result
    })
}
/// Returns database metadata for the first bundled pattern that matches
/// `user_agent`, or `None` when nothing matches.
///
/// Literal patterns are tried first through the Aho-Corasick automaton; the
/// remaining patterns are then evaluated chunk by chunk as `RegexSet`s, in
/// database order. Matching runs on the raw string — unlike [`is_crawler`],
/// no lowercasing is applied here.
#[cfg(feature = "database")]
pub fn crawler_info(user_agent: &str) -> Option<&'static CrawlerInfo> {
    // Materialize the lazily-built matchers (this also forces `CRAWLERS`).
    let matchers = &*DATABASE_MATCHERS;
    if let Some(matched) = matchers.literals.find(user_agent) {
        // NOTE(review): with aho-corasick >= 1.0, `Match::pattern()` returns
        // a `PatternID`, which cannot index a `Vec<usize>` directly (it would
        // need `.as_usize()`) — confirm which crate version is in use.
        return CRAWLERS.get(matchers.literal_indices[matched.pattern()]);
    }
    for chunk in &matchers.regexes {
        let matches = chunk.patterns.matches(user_agent);
        if let Some(index) = matches.iter().next() {
            return CRAWLERS.get(chunk.indices[index]);
        }
    }
    None
}
// Heuristic classification of a raw user agent byte string. Rules, in order:
// 1. Empty input counts as a crawler.
// 2. Input longer than the 512-byte scratch buffer never counts as one.
// 3. Any crawler keyword (case-insensitive) => crawler.
// 4. Without a "mozilla/"/"opera/" prefix: crawler iff no engine token.
// 5. With such a prefix: crawler only when no engine token is present AND
//    the string contains "(compatible;".
fn classify(source: &[u8]) -> bool {
    if source.is_empty() {
        return true;
    }
    // Lowercase into a fixed stack buffer; `get_mut` yields `None` (and we
    // bail out) when the input exceeds the scratch space.
    let mut scratch = [0u8; 512];
    let Some(lowered) = scratch.get_mut(..source.len()) else {
        return false;
    };
    lowered.copy_from_slice(source);
    lowered.make_ascii_lowercase();
    if contains_any(lowered, CRAWLER_KEYWORDS, &KEYWORD_FIRST_BYTES) {
        return true;
    }
    let engine = contains_any(lowered, BROWSER_ENGINES, &ENGINE_FIRST_BYTES);
    if lowered.starts_with(b"mozilla/") || lowered.starts_with(b"opera/") {
        !engine && lowered.windows(12).any(|w| w == b"(compatible;")
    } else {
        !engine
    }
}
// Number of slots in the per-thread memoization cache. Must remain a power of
// two: `is_crawler` masks keys with `CACHE_SLOTS - 1`.
const CACHE_SLOTS: usize = 256;
/// One memoized classification result.
#[derive(Clone, Copy)]
struct Entry {
    // Pointer-derived hash of the cached user agent slice (see `cache_key`).
    key: u64,
    // Byte length of the cached user agent.
    len: usize,
    // First/last eight bytes of the cached user agent (see `edge_words`).
    head: u64,
    tail: u64,
    // Cached `classify` verdict.
    result: bool,
}
// Per-thread direct-mapped cache. Slots start with `len: usize::MAX` so no
// real slice (whose length is at most isize::MAX) can match an untouched slot.
thread_local! {
    static CACHE: std::cell::RefCell<[Entry; CACHE_SLOTS]> =
        const { std::cell::RefCell::new([Entry { key: 0, len: usize::MAX, head: 0, tail: 0, result: false }; CACHE_SLOTS]) };
}
// Mixes the slice's address and length into a cheap cache key. The pointer
// identity — not the contents — is what goes into the key, so this is only a
// probabilistic fingerprint for the thread-local cache.
fn cache_key(bytes: &[u8]) -> u64 {
    let address = bytes.as_ptr() as usize as u64;
    (bytes.len() as u64) ^ address.rotate_left(17)
}
// Fingerprints a byte slice by its first and last eight bytes, read as
// native-endian words (the values are only ever compared against each other
// in the cache, so endianness does not matter). Slices shorter than eight
// bytes pack all their bytes into a single word returned for both positions.
fn edge_words(bytes: &[u8]) -> (u64, u64) {
    let len = bytes.len();
    if len >= 8 {
        let head = u64::from_ne_bytes(bytes[..8].try_into().unwrap());
        let tail = u64::from_ne_bytes(bytes[len - 8..].try_into().unwrap());
        (head, tail)
    } else {
        let mut packed = 0u64;
        for (slot, &value) in bytes.iter().enumerate() {
            packed |= (value as u64) << (8 * slot);
        }
        (packed, packed)
    }
}
// Reports whether any needle occurs in `haystack`. Needles are stored as
// (first byte, remaining bytes); `first_bytes` is a 256-entry table marking
// bytes that can begin a needle, so most positions are rejected with a single
// array load. A needle with an empty remainder matches its first byte alone.
fn contains_any(haystack: &[u8], needles: &[(u8, &[u8])], first_bytes: &[bool; 256]) -> bool {
    haystack.iter().enumerate().any(|(at, &byte)| {
        first_bytes[byte as usize]
            && needles.iter().any(|&(first, rest)| {
                // `get` returns `None` when fewer than `rest.len()` bytes
                // remain, which safely fails the comparison.
                first == byte && haystack[at + 1..].get(..rest.len()) == Some(rest)
            })
    })
}