use zerocopy::{FromBytes, Immutable, IntoBytes, KnownLayout};
/// Protocol version and status code taken from the first line of an HTTP response.
///
/// `#[repr(C)]` together with the zerocopy derives allows the value to be
/// converted to and from raw bytes; the tests in this file pin its size at
/// 4 bytes.
#[derive(Debug, Clone, Copy, FromBytes, IntoBytes, KnownLayout, Immutable)]
#[repr(C)]
pub struct HttpStatusLine {
/// Major HTTP version digit (the first `1` in `HTTP/1.1`).
pub version_major: u8,
/// Minor HTTP version digit (the second `1` in `HTTP/1.1`).
pub version_minor: u8,
/// Three-digit status code, e.g. `200` or `404`.
pub status_code: u16,
}
impl HttpStatusLine {
    /// Parses the fixed-width prefix of an HTTP status line such as
    /// `HTTP/1.1 200 OK`.
    ///
    /// Only the first 12 bytes are inspected; anything after the status code
    /// (reason phrase, line terminator) is ignored.
    ///
    /// Returns `None` when the buffer is shorter than 12 bytes, when the
    /// `HTTP/` literal, the `.` or the space separator is missing, or when
    /// any of the two version positions or three status positions is not an
    /// ASCII digit.
    #[inline]
    pub fn parse(buf: &[u8]) -> Option<Self> {
        // Numeric value of an ASCII digit, `None` for any other byte.
        fn digit(byte: u8) -> Option<u8> {
            if byte.is_ascii_digit() {
                Some(byte - b'0')
            } else {
                None
            }
        }

        let head = buf.get(..12)?;
        if !head.starts_with(b"HTTP/") || head[6] != b'.' || head[8] != b' ' {
            return None;
        }

        // Version digits are validated just like the status digits, so a
        // line such as `HTTP/x.1 200` is rejected instead of yielding a
        // meaningless version number.
        let version_major = digit(head[5])?;
        let version_minor = digit(head[7])?;

        let hundreds = u16::from(digit(head[9])?);
        let tens = u16::from(digit(head[10])?);
        let ones = u16::from(digit(head[11])?);

        Some(Self {
            version_major,
            version_minor,
            status_code: hundreds * 100 + tens * 10 + ones,
        })
    }

    /// `true` for 1xx status codes.
    #[inline]
    pub fn is_informational(&self) -> bool {
        (100..200).contains(&self.status_code)
    }

    /// `true` for 2xx status codes.
    #[inline]
    pub fn is_success(&self) -> bool {
        (200..300).contains(&self.status_code)
    }

    /// `true` for 3xx status codes.
    #[inline]
    pub fn is_redirect(&self) -> bool {
        (300..400).contains(&self.status_code)
    }

    /// `true` for 4xx status codes.
    #[inline]
    pub fn is_client_error(&self) -> bool {
        (400..500).contains(&self.status_code)
    }

    /// `true` for 5xx status codes.
    #[inline]
    pub fn is_server_error(&self) -> bool {
        (500..600).contains(&self.status_code)
    }

    /// `true` for the status codes this module treats as worth retrying:
    /// 408, 429, 500, 502, 503, 504, 507, 508, 598 and 599.
    #[inline]
    pub fn is_retryable(&self) -> bool {
        matches!(
            self.status_code,
            408 | 429 | 500 | 502 | 503 | 504 | 507 | 508 | 598 | 599
        )
    }
}
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
#[repr(u8)]
pub enum ContentTypeClass {
Html = 0,
Xml = 1,
Json = 2,
PlainText = 3,
WebAsset = 4,
Media = 5,
Binary = 6,
Unknown = 7,
}
impl ContentTypeClass {
    /// Whether a body of this class should be crawled: only `Html` and `Xml`
    /// qualify.
    #[inline]
    pub fn should_crawl(&self) -> bool {
        match self {
            Self::Html | Self::Xml => true,
            Self::Json
            | Self::PlainText
            | Self::WebAsset
            | Self::Media
            | Self::Binary
            | Self::Unknown => false,
        }
    }

    /// Whether this class denotes binary content: `Media` or `Binary`.
    #[inline]
    pub fn is_binary(&self) -> bool {
        match self {
            Self::Media | Self::Binary => true,
            Self::Html
            | Self::Xml
            | Self::Json
            | Self::PlainText
            | Self::WebAsset
            | Self::Unknown => false,
        }
    }
}
/// Classifies a raw `Content-Type` header value into a [`ContentTypeClass`].
///
/// Everything from the first `;` onwards (parameters such as `charset`) is
/// discarded and trailing ASCII whitespace is trimmed before matching.
/// Matching is case-insensitive and dispatches on the first byte of the
/// media type:
///
/// * `text/...` is delegated to `classify_text`;
/// * `audio/...`, `image/...` and `video/...` are `Media`;
/// * any other type starting with `a` is delegated to
///   `classify_application`;
/// * everything else, including `multipart/...` and the empty string, is
///   `Unknown`.
#[inline]
pub fn classify_content_type(raw: &[u8]) -> ContentTypeClass {
    let mime = match memchr::memchr(b';', raw) {
        Some(pos) => &raw[..pos],
        None => raw,
    };
    let mime = trim_ascii_end(mime);

    let first = match mime.first() {
        // Folding with 0x20 lower-cases ASCII letters; every arm below is a
        // lowercase letter, so non-letters can never match by accident.
        Some(&byte) => byte | 0x20,
        None => return ContentTypeClass::Unknown,
    };

    match first {
        b't' => classify_text(mime),
        // `audio/` shares its first letter with `application/`, so it has to
        // be checked before delegating.
        b'a' if starts_with_ignore_case(mime, b"audio/") => ContentTypeClass::Media,
        b'a' => classify_application(mime),
        b'i' if starts_with_ignore_case(mime, b"image/") => ContentTypeClass::Media,
        b'v' if starts_with_ignore_case(mime, b"video/") => ContentTypeClass::Media,
        // `multipart/...` has no dedicated class and lands here as well.
        _ => ContentTypeClass::Unknown,
    }
}
/// Classifies a `text/<subtype>` media type.
///
/// `mime` must start with `text/` (case-insensitive) and have at least one
/// byte after the slash, otherwise `Unknown` is returned. The subtype is
/// matched by case-insensitive prefix against a small table; subtypes not in
/// the table are `Unknown`.
#[inline]
fn classify_text(mime: &[u8]) -> ContentTypeClass {
    const TOP_LEVEL: &[u8] = b"text/";
    // The subtype prefixes all start with different letters, so at most one
    // entry can match and the order is irrelevant.
    const SUBTYPES: [(&[u8], ContentTypeClass); 5] = [
        (b"html", ContentTypeClass::Html),
        (b"xml", ContentTypeClass::Xml),
        (b"plain", ContentTypeClass::PlainText),
        (b"css", ContentTypeClass::WebAsset),
        (b"javascript", ContentTypeClass::WebAsset),
    ];

    if mime.len() <= TOP_LEVEL.len() || !starts_with_ignore_case(mime, TOP_LEVEL) {
        return ContentTypeClass::Unknown;
    }

    let subtype = &mime[TOP_LEVEL.len()..];
    for &(prefix, class) in SUBTYPES.iter() {
        if starts_with_ignore_case(subtype, prefix) {
            return class;
        }
    }
    ContentTypeClass::Unknown
}
/// Classifies an `application/<subtype>` media type.
///
/// `mime` must start with `application/` (case-insensitive) and have at
/// least one byte after the slash, otherwise `Unknown` is returned. Subtypes
/// are matched by case-insensitive prefix:
///
/// * `xhtml...` is `Html`;
/// * `xml...`, `rss+xml...`, `atom+xml...` are `Xml`;
/// * `json...`, `ld+json...` are `Json`;
/// * `javascript...`, `wasm...` are `WebAsset`;
/// * `x-...`, `octet-stream...`, `vnd....` and every subtype beginning with
///   `p` or `z` (e.g. `pdf`, `zip`) are `Binary`;
/// * anything else is `Unknown`.
#[inline]
fn classify_application(mime: &[u8]) -> ContentTypeClass {
    const TOP_LEVEL: &[u8] = b"application/";

    if mime.len() <= TOP_LEVEL.len() || !starts_with_ignore_case(mime, TOP_LEVEL) {
        return ContentTypeClass::Unknown;
    }
    let rest = &mime[TOP_LEVEL.len()..];

    match rest[0].to_ascii_lowercase() {
        b'x' => {
            if starts_with_ignore_case(rest, b"xhtml") {
                ContentTypeClass::Html
            } else if starts_with_ignore_case(rest, b"xml") {
                ContentTypeClass::Xml
            } else if rest.get(1) == Some(&b'-') {
                // Exact comparison: '-' is not a letter, so case-folding it
                // with `| 0x20` would also accept a carriage return (0x0D).
                ContentTypeClass::Binary
            } else {
                ContentTypeClass::Unknown
            }
        }
        b'r' if starts_with_ignore_case(rest, b"rss+xml") => ContentTypeClass::Xml,
        b'a' if starts_with_ignore_case(rest, b"atom+xml") => ContentTypeClass::Xml,
        b'j' => {
            if starts_with_ignore_case(rest, b"javascript") {
                ContentTypeClass::WebAsset
            } else if starts_with_ignore_case(rest, b"json") {
                ContentTypeClass::Json
            } else {
                ContentTypeClass::Unknown
            }
        }
        b'l' if starts_with_ignore_case(rest, b"ld+json") => ContentTypeClass::Json,
        b'w' if starts_with_ignore_case(rest, b"wasm") => ContentTypeClass::WebAsset,
        // Every subtype starting with `p` or `z` is treated as binary.
        b'p' | b'z' => ContentTypeClass::Binary,
        b'o' if starts_with_ignore_case(rest, b"octet-stream") => ContentTypeClass::Binary,
        b'v' if starts_with_ignore_case(rest, b"vnd.") => ContentTypeClass::Binary,
        _ => ContentTypeClass::Unknown,
    }
}
/// Returns `true` when `haystack` begins with `needle`, comparing ASCII
/// letters case-insensitively and every other byte exactly.
///
/// An empty `needle` always matches; a `haystack` shorter than `needle`
/// never does.
#[inline]
fn starts_with_ignore_case(haystack: &[u8], needle: &[u8]) -> bool {
    // `eq_ignore_ascii_case` folds only A-Z/a-z. Folding every byte with
    // `| 0x20` would additionally equate unrelated bytes such as '\r' and
    // '-', or '@' and '`'.
    haystack
        .get(..needle.len())
        .map_or(false, |prefix| prefix.eq_ignore_ascii_case(needle))
}
/// Returns `s` without its trailing ASCII whitespace bytes (as defined by
/// `u8::is_ascii_whitespace`). Leading and interior whitespace is kept.
#[inline]
fn trim_ascii_end(s: &[u8]) -> &[u8] {
    // Index one past the last non-whitespace byte, or 0 if there is none.
    let keep = s
        .iter()
        .rposition(|byte| !byte.is_ascii_whitespace())
        .map_or(0, |last| last + 1);
    &s[..keep]
}
/// Fixed-size header of a cache entry.
///
/// The length fields describe a payload of URL bytes, then header bytes,
/// then body bytes (see `url_from` / `body_from`). The struct is `#[repr(C)]`
/// and the field sizes add up to 32 bytes with no padding; the tests in this
/// file pin that size.
#[derive(Debug, Clone, Copy, PartialEq, Eq, FromBytes, IntoBytes, KnownLayout, Immutable)]
#[repr(C)]
pub struct CacheEntryHeader {
/// Format tag; `from_bytes` rejects headers whose value is not `MAGIC`.
pub magic: u32,
/// Length in bytes of the URL at the start of the payload.
pub url_len: u32,
/// Length in bytes of the header section that follows the URL.
pub headers_len: u32,
/// Length in bytes of the body that follows the header section.
pub body_len: u32,
/// Creation time in seconds since the UNIX epoch (set by `new`).
pub cached_at: u64,
/// Lifetime in seconds, measured from `cached_at` (see `is_expired`).
pub ttl_secs: u32,
/// HTTP status code supplied to `new`.
pub status_code: u16,
/// Format version; `from_bytes` rejects headers whose value is not `VERSION`.
pub version: u8,
/// `ContentTypeClass` discriminant stored as a raw `u8`.
pub content_type: u8,
}
impl CacheEntryHeader {
    /// Format tag: the bytes `SPDR` read as a little-endian `u32`.
    pub const MAGIC: u32 = u32::from_le_bytes(*b"SPDR");
    /// Format version written by `new` and required by `from_bytes`.
    pub const VERSION: u8 = 1;
    /// Size of the header in bytes.
    pub const SIZE: usize = std::mem::size_of::<Self>();

    /// Current time in whole seconds since the UNIX epoch, or 0 when the
    /// system clock reports a time before the epoch.
    #[inline]
    fn unix_now_secs() -> u64 {
        std::time::SystemTime::now()
            .duration_since(std::time::UNIX_EPOCH)
            .unwrap_or_default()
            .as_secs()
    }

    /// Borrows a header from the start of `buf` without copying.
    ///
    /// Returns `None` when `buf` is shorter than `SIZE`, when the zerocopy
    /// prefix cast fails, or when the magic or version field does not match
    /// `MAGIC` / `VERSION`.
    ///
    /// NOTE(review): the struct contains a `u64`, so the zerocopy cast
    /// presumably also fails for buffers that are not suitably aligned --
    /// confirm against the zerocopy documentation.
    #[inline]
    pub fn from_bytes(buf: &[u8]) -> Option<&Self> {
        if buf.len() < Self::SIZE {
            return None;
        }
        let (header_ref, _) = zerocopy::Ref::<&[u8], Self>::from_prefix(buf).ok()?;
        let header: &Self = zerocopy::Ref::into_ref(header_ref);
        if header.magic != Self::MAGIC || header.version != Self::VERSION {
            return None;
        }
        Some(header)
    }

    /// Builds a header stamped with the current time, `MAGIC` and `VERSION`.
    ///
    /// `content_type` is stored as its `u8` discriminant.
    pub fn new(
        status_code: u16,
        content_type: ContentTypeClass,
        url_len: u32,
        headers_len: u32,
        body_len: u32,
        ttl_secs: u32,
    ) -> Self {
        Self {
            magic: Self::MAGIC,
            version: Self::VERSION,
            content_type: content_type as u8,
            status_code,
            url_len,
            headers_len,
            body_len,
            cached_at: Self::unix_now_secs(),
            ttl_secs,
        }
    }

    /// `true` once the current time is strictly later than
    /// `cached_at + ttl_secs`.
    ///
    /// The sum saturates, so a header with a corrupted, very large
    /// `cached_at` cannot overflow (which would panic in debug builds and
    /// wrap around in release builds).
    #[inline]
    pub fn is_expired(&self) -> bool {
        let expires_at = self.cached_at.saturating_add(u64::from(self.ttl_secs));
        Self::unix_now_secs() > expires_at
    }

    /// Header size plus the three payload lengths.
    ///
    /// The sum saturates at `usize::MAX` instead of overflowing on targets
    /// where `usize` is only 32 bits wide.
    #[inline]
    pub fn total_entry_size(&self) -> usize {
        Self::SIZE
            .saturating_add(self.url_len as usize)
            .saturating_add(self.headers_len as usize)
            .saturating_add(self.body_len as usize)
    }

    /// Returns the URL bytes: the first `url_len` bytes of `payload`, or
    /// `None` when `payload` is shorter than that.
    #[inline]
    pub fn url_from<'a>(&self, payload: &'a [u8]) -> Option<&'a [u8]> {
        payload.get(..self.url_len as usize)
    }

    /// Returns the body bytes: the `body_len` bytes that follow the URL and
    /// header sections in `payload`.
    ///
    /// Returns `None` when `payload` is too short or when the section
    /// offsets do not fit in `usize`.
    #[inline]
    pub fn body_from<'a>(&self, payload: &'a [u8]) -> Option<&'a [u8]> {
        let body_start = (self.url_len as usize).checked_add(self.headers_len as usize)?;
        let body_end = body_start.checked_add(self.body_len as usize)?;
        payload.get(body_start..body_end)
    }
}
/// Fixed-size DNS cache record holding one resolved address.
///
/// The struct is `#[repr(C)]` and the field sizes add up to 24 bytes with no
/// implicit padding; the tests in this file pin that size.
#[derive(Debug, Clone, Copy, PartialEq, Eq, FromBytes, IntoBytes, KnownLayout, Immutable)]
#[repr(C)]
pub struct DnsCacheRecord {
/// Address bytes: all 16 bytes for IPv6, or the first 4 bytes for IPv4
/// with the remainder zeroed by `from_ipv4`.
pub addr_bytes: [u8; 16],
/// Time-to-live in seconds, as supplied to the constructor.
pub ttl_secs: u32,
/// Hostname length supplied to the constructor; the hostname itself is not
/// stored in this struct.
pub hostname_len: u16,
/// Address family tag: `4` for IPv4, `6` for IPv6 (as written by the
/// constructors).
pub addr_type: u8,
/// Explicit padding byte; the constructors set it to 0.
pub _pad: u8,
}
impl DnsCacheRecord {
    /// Size of the record in bytes.
    pub const SIZE: usize = std::mem::size_of::<Self>();

    /// Builds an IPv4 record: the four octets occupy the start of
    /// `addr_bytes`, the remaining twelve bytes are zero and `addr_type`
    /// is 4.
    pub fn from_ipv4(addr: [u8; 4], ttl_secs: u32, hostname_len: u16) -> Self {
        let [a, b, c, d] = addr;
        Self {
            addr_bytes: [a, b, c, d, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
            ttl_secs,
            hostname_len,
            addr_type: 4,
            _pad: 0,
        }
    }

    /// Builds an IPv6 record: `addr` is stored verbatim and `addr_type`
    /// is 6.
    pub fn from_ipv6(addr: [u8; 16], ttl_secs: u32, hostname_len: u16) -> Self {
        Self {
            addr_bytes: addr,
            ttl_secs,
            hostname_len,
            addr_type: 6,
            _pad: 0,
        }
    }

    /// Borrows a record from the start of `buf` without copying.
    ///
    /// Returns `None` when `buf` is shorter than `SIZE` or when the zerocopy
    /// prefix cast fails. No field values are validated.
    #[inline]
    pub fn from_bytes(buf: &[u8]) -> Option<&Self> {
        if buf.len() >= Self::SIZE {
            zerocopy::Ref::<&[u8], Self>::from_prefix(buf)
                .ok()
                .map(|(record, _rest)| zerocopy::Ref::into_ref(record))
        } else {
            None
        }
    }

    /// Converts the stored bytes into an `IpAddr`.
    ///
    /// An `addr_type` of 4 yields an IPv4 address built from the first four
    /// bytes; every other value is interpreted as IPv6 using all sixteen
    /// bytes.
    #[inline]
    pub fn to_ip_addr(&self) -> std::net::IpAddr {
        if self.addr_type == 4 {
            let bytes = &self.addr_bytes;
            let v4 = std::net::Ipv4Addr::new(bytes[0], bytes[1], bytes[2], bytes[3]);
            std::net::IpAddr::V4(v4)
        } else {
            std::net::IpAddr::V6(std::net::Ipv6Addr::from(self.addr_bytes))
        }
    }
}
/// Extracts the value of the first usable `Content-Length` header from a raw
/// header block.
///
/// The header name is matched case-insensitively at the start of a line
/// (lines are separated by `\n`). The name must be followed by at least one
/// separator byte (`:`, space or tab) and then at least one ASCII digit;
/// lines that do not satisfy this are skipped. Parsing stops at the first
/// non-digit byte.
///
/// Returns `None` when no such line exists or when the number does not fit
/// in a `u64`.
#[inline]
pub fn extract_content_length(headers_raw: &[u8]) -> Option<u64> {
    const NEEDLE: &[u8] = b"content-length";

    for line in headers_raw.split(|&byte| byte == b'\n') {
        if !starts_with_ignore_case(line, NEEDLE) {
            continue;
        }
        let after_name = &line[NEEDLE.len()..];

        let separator_len = after_name
            .iter()
            .take_while(|&&byte| matches!(byte, b':' | b' ' | b'\t'))
            .count();
        if separator_len == 0 {
            // A different header that merely shares the prefix, or digits
            // glued straight onto the name.
            continue;
        }

        let value = &after_name[separator_len..];
        let digit_count = value.iter().take_while(|byte| byte.is_ascii_digit()).count();
        if digit_count == 0 {
            // Empty or non-numeric value: do not report a bogus length of 0.
            continue;
        }

        let mut length: u64 = 0;
        for &digit in &value[..digit_count] {
            // Checked arithmetic: an oversized value is rejected rather than
            // silently wrapped into an unrelated number.
            length = length
                .checked_mul(10)?
                .checked_add(u64::from(digit - b'0'))?;
        }
        return Some(length);
    }
    None
}
/// Extracts the raw value of the first usable `Content-Type` header from a
/// raw header block.
///
/// The header name is matched case-insensitively at the start of a line
/// (lines are separated by `\n`). The name must be followed by at least one
/// separator byte (`:`, space or tab); the value then runs up to, but not
/// including, the first `\r` or the end of the line. Lines with a different
/// header name that merely starts with `content-type`, or with an empty
/// value, are skipped.
///
/// The returned slice borrows from `headers_raw`; trailing spaces before the
/// line terminator are not removed.
#[inline]
pub fn extract_content_type_bytes(headers_raw: &[u8]) -> Option<&[u8]> {
    const NEEDLE: &[u8] = b"content-type";

    for line in headers_raw.split(|&byte| byte == b'\n') {
        if !starts_with_ignore_case(line, NEEDLE) {
            continue;
        }
        let after_name = &line[NEEDLE.len()..];

        let separator_len = after_name
            .iter()
            .take_while(|&&byte| matches!(byte, b':' | b' ' | b'\t'))
            .count();
        if separator_len == 0 {
            // e.g. `Content-Type-Foo: bar` -- not the header we want.
            continue;
        }

        let value = &after_name[separator_len..];
        let value_len = value
            .iter()
            .position(|&byte| byte == b'\r')
            .unwrap_or(value.len());
        if value_len > 0 {
            return Some(&value[..value_len]);
        }
    }
    None
}
// Unit tests for status-line parsing, content-type classification, the
// zerocopy cache/DNS records and raw header extraction.
#[cfg(test)]
mod tests {
use super::*;
// --- HttpStatusLine::parse and the status-class helpers ---
#[test]
fn test_parse_status_200() {
let sl = HttpStatusLine::parse(b"HTTP/1.1 200 OK\r\n").unwrap();
assert_eq!(sl.version_major, 1);
assert_eq!(sl.version_minor, 1);
assert_eq!(sl.status_code, 200);
assert!(sl.is_success());
}
#[test]
fn test_parse_status_404() {
let sl = HttpStatusLine::parse(b"HTTP/1.1 404 Not Found\r\n").unwrap();
assert_eq!(sl.status_code, 404);
assert!(sl.is_client_error());
}
#[test]
fn test_parse_status_503() {
let sl = HttpStatusLine::parse(b"HTTP/1.1 503 Service Unavailable\r\n").unwrap();
assert_eq!(sl.status_code, 503);
assert!(sl.is_server_error());
assert!(sl.is_retryable());
}
#[test]
fn test_parse_http10() {
let sl = HttpStatusLine::parse(b"HTTP/1.0 301 Moved Permanently\r\n").unwrap();
assert_eq!(sl.version_major, 1);
assert_eq!(sl.version_minor, 0);
assert_eq!(sl.status_code, 301);
assert!(sl.is_redirect());
}
// Exactly 12 bytes is the shortest input `parse` accepts.
#[test]
fn test_parse_minimal() {
let sl = HttpStatusLine::parse(b"HTTP/1.1 200").unwrap();
assert_eq!(sl.status_code, 200);
}
#[test]
fn test_parse_too_short() {
assert!(HttpStatusLine::parse(b"HTTP/1.1 20").is_none());
}
#[test]
fn test_parse_bad_prefix() {
assert!(HttpStatusLine::parse(b"HTTZ/1.1 200 OK\r\n").is_none());
}
#[test]
fn test_parse_bad_version_sep() {
assert!(HttpStatusLine::parse(b"HTTP/1X1 200 OK\r\n").is_none());
}
#[test]
fn test_status_429_retryable() {
let sl = HttpStatusLine::parse(b"HTTP/1.1 429 Too Many Requests\r\n").unwrap();
assert!(sl.is_retryable());
}
#[test]
fn test_status_201_not_retryable() {
let sl = HttpStatusLine::parse(b"HTTP/1.1 201 Created\r\n").unwrap();
assert!(!sl.is_retryable());
}
#[test]
fn test_informational() {
let sl = HttpStatusLine::parse(b"HTTP/1.1 100 Continue\r\n").unwrap();
assert!(sl.is_informational());
}
// --- classify_content_type and ContentTypeClass helpers ---
#[test]
fn test_classify_html() {
assert_eq!(
classify_content_type(b"text/html; charset=utf-8"),
ContentTypeClass::Html
);
assert_eq!(classify_content_type(b"TEXT/HTML"), ContentTypeClass::Html);
}
#[test]
fn test_classify_xhtml() {
assert_eq!(
classify_content_type(b"application/xhtml+xml"),
ContentTypeClass::Html
);
}
#[test]
fn test_classify_xml() {
assert_eq!(
classify_content_type(b"application/xml"),
ContentTypeClass::Xml
);
assert_eq!(classify_content_type(b"text/xml"), ContentTypeClass::Xml);
assert_eq!(
classify_content_type(b"application/rss+xml"),
ContentTypeClass::Xml
);
}
#[test]
fn test_classify_json() {
assert_eq!(
classify_content_type(b"application/json"),
ContentTypeClass::Json
);
assert_eq!(
classify_content_type(b"application/ld+json"),
ContentTypeClass::Json
);
}
#[test]
fn test_classify_media() {
assert_eq!(
classify_content_type(b"image/jpeg"),
ContentTypeClass::Media
);
assert_eq!(classify_content_type(b"video/mp4"), ContentTypeClass::Media);
assert_eq!(classify_content_type(b"image/png"), ContentTypeClass::Media);
}
#[test]
fn test_classify_binary() {
assert_eq!(
classify_content_type(b"application/pdf"),
ContentTypeClass::Binary
);
assert_eq!(
classify_content_type(b"application/zip"),
ContentTypeClass::Binary
);
assert_eq!(
classify_content_type(b"application/octet-stream"),
ContentTypeClass::Binary
);
}
#[test]
fn test_classify_web_assets() {
assert_eq!(
classify_content_type(b"text/css"),
ContentTypeClass::WebAsset
);
assert_eq!(
classify_content_type(b"application/javascript"),
ContentTypeClass::WebAsset
);
}
#[test]
fn test_classify_unknown() {
assert_eq!(classify_content_type(b""), ContentTypeClass::Unknown);
assert_eq!(
classify_content_type(b"something/weird"),
ContentTypeClass::Unknown
);
}
// Everything after the first ';' is ignored by the classifier.
#[test]
fn test_classify_with_params() {
assert_eq!(
classify_content_type(b"text/html; charset=utf-8; boundary=something"),
ContentTypeClass::Html
);
}
#[test]
fn test_should_crawl() {
assert!(ContentTypeClass::Html.should_crawl());
assert!(ContentTypeClass::Xml.should_crawl());
assert!(!ContentTypeClass::Json.should_crawl());
assert!(!ContentTypeClass::Binary.should_crawl());
}
#[test]
fn test_is_binary() {
assert!(ContentTypeClass::Media.is_binary());
assert!(ContentTypeClass::Binary.is_binary());
assert!(!ContentTypeClass::Html.is_binary());
}
// --- CacheEntryHeader ---
// Serialises a header with zerocopy and reads it back through `from_bytes`.
#[test]
fn test_cache_entry_header_roundtrip() {
let header = CacheEntryHeader::new(200, ContentTypeClass::Html, 25, 100, 5000, 3600);
let bytes = zerocopy::IntoBytes::as_bytes(&header);
assert_eq!(bytes.len(), CacheEntryHeader::SIZE);
let parsed = CacheEntryHeader::from_bytes(bytes).unwrap();
assert_eq!(parsed.status_code, 200);
assert_eq!(parsed.content_type, ContentTypeClass::Html as u8);
assert_eq!(parsed.url_len, 25);
assert_eq!(parsed.headers_len, 100);
assert_eq!(parsed.body_len, 5000);
assert_eq!(parsed.ttl_secs, 3600);
}
#[test]
fn test_cache_entry_header_bad_magic() {
let mut bytes = [0u8; CacheEntryHeader::SIZE];
bytes[0..4].copy_from_slice(b"XXXX");
assert!(CacheEntryHeader::from_bytes(&bytes).is_none());
}
#[test]
fn test_cache_entry_header_too_short() {
assert!(CacheEntryHeader::from_bytes(&[0u8; 4]).is_none());
}
#[test]
fn test_cache_entry_total_size() {
let header = CacheEntryHeader::new(200, ContentTypeClass::Html, 20, 50, 1000, 60);
assert_eq!(
header.total_entry_size(),
CacheEntryHeader::SIZE + 20 + 50 + 1000
);
}
// Payload layout: 3 URL bytes, 2 header bytes, then the 5-byte body.
#[test]
fn test_cache_entry_body_extraction() {
let header = CacheEntryHeader::new(200, ContentTypeClass::Html, 3, 2, 5, 60);
let payload = b"abcdeHELLO";
let body = header.body_from(payload).unwrap();
assert_eq!(body, b"HELLO");
}
#[test]
fn test_cache_entry_url_extraction() {
let header = CacheEntryHeader::new(200, ContentTypeClass::Html, 5, 0, 0, 60);
let payload = b"hello";
let url = header.url_from(payload).unwrap();
assert_eq!(url, b"hello");
}
// --- DnsCacheRecord ---
#[test]
fn test_dns_record_ipv4() {
let rec = DnsCacheRecord::from_ipv4([192, 168, 1, 1], 300, 11);
assert_eq!(rec.addr_type, 4);
let ip = rec.to_ip_addr();
assert_eq!(
ip,
std::net::IpAddr::V4(std::net::Ipv4Addr::new(192, 168, 1, 1))
);
}
#[test]
fn test_dns_record_ipv6() {
let addr: [u8; 16] = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1];
let rec = DnsCacheRecord::from_ipv6(addr, 600, 9);
assert_eq!(rec.addr_type, 6);
let ip = rec.to_ip_addr();
assert_eq!(ip, std::net::IpAddr::V6(std::net::Ipv6Addr::LOCALHOST));
}
#[test]
fn test_dns_record_roundtrip() {
let rec = DnsCacheRecord::from_ipv4([10, 0, 0, 1], 120, 7);
let bytes = zerocopy::IntoBytes::as_bytes(&rec);
let parsed = DnsCacheRecord::from_bytes(bytes).unwrap();
assert_eq!(parsed.addr_type, 4);
assert_eq!(parsed.ttl_secs, 120);
assert_eq!(parsed.hostname_len, 7);
}
// --- Raw header extraction ---
#[test]
fn test_extract_content_length() {
let raw = b"Host: example.com\r\nContent-Length: 12345\r\nConnection: keep-alive\r\n";
assert_eq!(extract_content_length(raw), Some(12345));
}
#[test]
fn test_extract_content_length_case_insensitive() {
let raw = b"content-length: 999\r\n";
assert_eq!(extract_content_length(raw), Some(999));
}
#[test]
fn test_extract_content_length_missing() {
let raw = b"Host: example.com\r\n";
assert_eq!(extract_content_length(raw), None);
}
#[test]
fn test_extract_content_type_bytes() {
let raw = b"Content-Type: text/html; charset=utf-8\r\nHost: x\r\n";
let ct = extract_content_type_bytes(raw).unwrap();
assert_eq!(ct, b"text/html; charset=utf-8");
}
#[test]
fn test_extract_content_type_missing() {
let raw = b"Host: example.com\r\n";
assert!(extract_content_type_bytes(raw).is_none());
}
// --- Layout guards: these sizes are part of the serialised format ---
#[test]
fn test_status_line_size() {
assert_eq!(std::mem::size_of::<HttpStatusLine>(), 4);
}
#[test]
fn test_cache_header_size_stable() {
assert_eq!(CacheEntryHeader::SIZE, 32);
}
#[test]
fn test_dns_record_size_stable() {
assert_eq!(DnsCacheRecord::SIZE, 24);
}
// --- Additional classifier and byte-conversion cases ---
#[test]
fn test_classify_content_type_trailing_space() {
assert_eq!(
classify_content_type(b"text/html "),
ContentTypeClass::Html
);
}
#[test]
fn test_classify_vnd_prefix() {
assert_eq!(
classify_content_type(b"application/vnd.ms-excel"),
ContentTypeClass::Binary
);
}
// Converts a status line to bytes and back with zerocopy.
#[test]
fn test_status_line_as_bytes() {
let sl = HttpStatusLine {
version_major: 1,
version_minor: 1,
status_code: 200,
};
let bytes = zerocopy::IntoBytes::as_bytes(&sl);
assert_eq!(bytes.len(), 4);
let parsed: HttpStatusLine = zerocopy::FromBytes::read_from_bytes(bytes).unwrap();
assert_eq!(sl.status_code, parsed.status_code);
}
}