//! Crawl helpers: ASCII case-insensitive substring search, filter-pattern
//! compilation, and robots.txt retrieval.

use regex::Regex;
use url::Url;

use crate::error::CrawlError;
use crate::http::http_fetch;
use crate::robots::{RobotsRules, parse_robots_txt};
use crate::types::CrawlConfig;

/// Returns the byte offset of the first match of `needle` in `haystack`,
/// comparing ASCII characters case-insensitively. Non-ASCII bytes must match
/// exactly.
pub(crate) fn find_ascii_case_insensitive(haystack: &str, needle: &str) -> Option<usize> {
    let haystack_bytes = haystack.as_bytes();
    let needle_bytes = needle.as_bytes();
    if needle_bytes.len() > haystack_bytes.len() {
        return None;
    }
    (0..=(haystack_bytes.len() - needle_bytes.len())).find(|&i| {
        haystack_bytes[i..i + needle_bytes.len()]
            .iter()
            .zip(needle_bytes.iter())
            // Lowercase both sides so a mixed-case needle can match too.
            .all(|(h, n)| h.to_ascii_lowercase() == n.to_ascii_lowercase())
    })
}

/// Compiles every pattern into a `Regex`, failing on the first invalid one
/// with an error that names the offending pattern.
pub(crate) fn compile_regexes(patterns: &[String]) -> Result<Vec<Regex>, CrawlError> {
    patterns
        .iter()
        .map(|pat| {
            Regex::new(pat)
                .map_err(|e| CrawlError::Other(format!("invalid regex pattern \"{pat}\": {e}")))
        })
        .collect()
}
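
// Test-only sanity checks for the two pure helpers above (a minimal sketch;
// no dev-dependencies beyond what the module already uses).
#[cfg(test)]
mod helper_tests {
    use super::*;

    #[test]
    fn find_is_case_insensitive_on_both_sides() {
        // "Content-Type: " is 14 bytes, so the match begins at offset 14.
        assert_eq!(
            find_ascii_case_insensitive("Content-Type: TEXT/HTML", "text/html"),
            Some(14)
        );
        assert_eq!(find_ascii_case_insensitive("abc", "zzz"), None);
        // An empty needle matches at offset 0.
        assert_eq!(find_ascii_case_insensitive("abc", ""), Some(0));
    }

    #[test]
    fn compile_regexes_accepts_valid_and_rejects_invalid_patterns() {
        let compiled = compile_regexes(&["^/docs/".to_string(), r"\.html$".to_string()]);
        assert_eq!(compiled.map(|v| v.len()).ok(), Some(2));
        // "(" is an unclosed group and must be reported as an error.
        assert!(compile_regexes(&["(".to_string()]).is_err());
    }
}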

/// Fetches and parses `robots.txt` for the origin of `url`. Returns `None`
/// when the URL has no host, the request fails, or the server responds with a
/// 4xx/5xx status; callers should then treat the host as having no rules.
pub(crate) async fn fetch_robots_rules(
    url: &str,
    config: &CrawlConfig,
    client: &reqwest::Client,
) -> Option<RobotsRules> {
    let parsed = Url::parse(url).ok()?;
    // Bail out for host-less URLs (e.g. data: or mailto:).
    parsed.host_str()?;
    // Join against the original URL so a non-default port is kept:
    // robots.txt is scoped per scheme, host, and port.
    let robots_url = parsed.join("/robots.txt").ok()?;
    let ua = config
        .user_agent
        .as_deref()
        .unwrap_or(concat!("kreuzcrawl/", env!("CARGO_PKG_VERSION")));
    let resp = http_fetch(
        robots_url.as_str(),
        config,
        &std::collections::HashMap::new(),
        client,
    )
    .await
    .ok()?;
    if resp.status >= 400 {
        return None;
    }
    Some(parse_robots_txt(&resp.body, ua))
}
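
// A network-touching smoke test for fetch_robots_rules, kept behind #[ignore]
// so it only runs via `cargo test -- --ignored`. Assumptions: tokio's test
// macro is available as a dev-dependency and CrawlConfig implements Default;
// adjust the config construction if crate::types differs.
#[cfg(test)]
mod robots_tests {
    use super::*;

    #[tokio::test]
    #[ignore = "requires network access"]
    async fn fetches_rules_for_a_live_host() {
        let config = CrawlConfig::default();
        let client = reqwest::Client::new();
        // Any page URL works; only its scheme, host, and port are used.
        let rules = fetch_robots_rules("https://www.rust-lang.org/learn", &config, &client).await;
        // Assumes the host serves robots.txt with a non-error status.
        assert!(rules.is_some());
    }
}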