mod types;
pub use types::{ClassificationResult, ContentCategory, ContentFormat};
mod magic;
pub use magic::detect_magic;
mod text_gate; pub use text_gate::{TextGateResult, inspect_text};
mod heuristic; pub use heuristic::classify_text;
/// Classifies a raw byte buffer and returns its [`ClassificationResult`].
///
/// The stages run in a fixed order and the first one that produces an answer wins:
///
/// 1. Empty input is reported as `Unknown` (`application/octet-stream`, confidence `0.0`,
///    not extractable).
/// 2. [`detect_magic`] is consulted; any result it returns is passed through unchanged.
/// 3. [`inspect_text`] decides how everything else is handled:
///    - `TextGateResult::Empty` yields the same `Unknown` result as empty input;
///    - `TextGateResult::Binary` yields `application/octet-stream` in the `Binary`
///      category with confidence `0.2`, not extractable;
///    - `TextGateResult::Text` hands the bytes starting at `bom_stripped_offset` to
///      [`classify_text`].
pub fn classify_content(bytes: &[u8]) -> ClassificationResult {
    if bytes.is_empty() {
        return unknown_result();
    }

    // Magic-number detection takes precedence over any text inspection.
    if let Some(result) = detect_magic(bytes) {
        return result;
    }

    match inspect_text(bytes) {
        TextGateResult::Empty => unknown_result(),
        TextGateResult::Binary => ClassificationResult {
            mime_type: "application/octet-stream".into(),
            category: ContentCategory::Binary,
            format: ContentFormat::Unknown,
            confidence: 0.2,
            is_extractable: false,
        },
        TextGateResult::Text {
            bom_stripped_offset,
        } => {
            // Checked slicing: an offset past the end of the buffer must not panic here.
            // In that case the whole, unstripped buffer is classified instead. An offset
            // equal to `bytes.len()` still produces an empty slice, as plain indexing would.
            let text = bytes.get(bom_stripped_offset..).unwrap_or(bytes);
            classify_text(text)
        }
    }
}

/// Builds the result used when nothing can be said about the content:
/// `application/octet-stream`, `Unknown` category and format, confidence `0.0`,
/// not extractable.
fn unknown_result() -> ClassificationResult {
    ClassificationResult {
        mime_type: "application/octet-stream".into(),
        category: ContentCategory::Unknown,
        format: ContentFormat::Unknown,
        confidence: 0.0,
        is_extractable: false,
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Empty input is reported as unknown, with zero confidence and nothing to extract.
    #[test]
    fn classify_empty_returns_unknown() {
        let outcome = classify_content(&[]);

        assert_eq!(outcome.category, ContentCategory::Unknown);
        assert_eq!(outcome.format, ContentFormat::Unknown);
        assert_eq!(outcome.confidence, 0.0);
        assert!(!outcome.is_extractable);
    }

    /// Smoke test: a handful of arbitrary buffers must be classified without panicking.
    /// The results themselves are deliberately ignored.
    #[test]
    fn classify_never_panics() {
        let samples: [&[u8]; 3] = [b"anything", &[0, 1, 2, 3], &[0xFF; 1024]];

        for sample in samples {
            let _ = classify_content(sample);
        }
    }

    /// A PDF header is recognised with a confidence of 1.0 (within `f32::EPSILON`).
    #[test]
    fn classify_pdf() {
        let pdf_header = b"%PDF-1.7\n%\xE2\xE3\xCF\xD3\n";
        let outcome = classify_content(pdf_header);

        assert_eq!(outcome.format, ContentFormat::Pdf);
        assert_eq!(outcome.category, ContentCategory::Binary);
        assert!((outcome.confidence - 1.0).abs() < f32::EPSILON);
    }

    /// The PNG signature maps to the PNG format in the binary category.
    #[test]
    fn classify_png() {
        let png_signature = b"\x89PNG\r\n\x1a\n";
        let outcome = classify_content(png_signature);

        assert_eq!(outcome.format, ContentFormat::Png);
        assert_eq!(outcome.category, ContentCategory::Binary);
    }

    /// A JFIF-style JPEG header maps to the JPEG format.
    #[test]
    fn classify_jpeg() {
        let jpeg_header = b"\xFF\xD8\xFF\xE0\x00\x10JFIF";
        let outcome = classify_content(jpeg_header);

        assert_eq!(outcome.format, ContentFormat::Jpeg);
    }

    /// The `PK\x03\x04` signature maps to the ZIP format.
    #[test]
    fn classify_zip() {
        let zip_signature = b"PK\x03\x04";
        let outcome = classify_content(zip_signature);

        assert_eq!(outcome.format, ContentFormat::Zip);
    }

    /// Headings followed by body text are enough to be treated as Markdown.
    #[test]
    fn classify_markdown_heading_only() {
        let document = b"# H\n## H2\ncontent";
        let outcome = classify_content(document);

        assert_eq!(outcome.format, ContentFormat::Markdown);
    }

    /// A JSON object is classified as JSON in the structured category.
    #[test]
    fn classify_json_object() {
        let document = b"{\"key\": 42, \"ok\": true}";
        let outcome = classify_content(document);

        assert_eq!(outcome.format, ContentFormat::Json);
        assert_eq!(outcome.category, ContentCategory::Structured);
    }

    /// A small Rust snippet is classified as Rust in the text category.
    #[test]
    fn classify_rust_code() {
        let snippet = b"pub fn main() {\n let x = 1;\n}";
        let outcome = classify_content(snippet);

        assert_eq!(outcome.format, ContentFormat::Rust);
        assert_eq!(outcome.category, ContentCategory::Text);
    }

    /// A small JavaScript snippet is classified as JavaScript.
    #[test]
    fn classify_javascript_code() {
        let snippet = b"const x = 1;\nfunction f() { return x; }";
        let outcome = classify_content(snippet);

        assert_eq!(outcome.format, ContentFormat::JavaScript);
    }

    /// Prose without any markup is classified as plain text.
    #[test]
    fn classify_plain_text() {
        let prose = b"just a simple paragraph of prose";
        let outcome = classify_content(prose);

        assert_eq!(outcome.format, ContentFormat::PlainText);
    }

    /// A leading UTF-8 byte-order mark must not prevent Markdown from being recognised.
    #[test]
    fn classify_utf8_bom_markdown() {
        let outcome = classify_content(b"\xEF\xBB\xBF# Title\n\nbody");

        assert_eq!(outcome.format, ContentFormat::Markdown);
    }

    /// Only checks that a bare `FF FE` byte pair is handled without panicking;
    /// no particular result is asserted.
    #[test]
    fn classify_utf16_bom_falls_through() {
        let _ = classify_content(b"\xFF\xFE");
    }

    /// Bytes with no recognised signature are reported as binary with an unknown format.
    #[test]
    fn classify_binary_without_magic() {
        let noise: [u8; 10] = [0x00, 0x01, 0x02, 0x00, 0xFF, 0xFE, 0xFE, 0xFF, 0x00, 0x00];
        let outcome = classify_content(&noise);

        assert_eq!(outcome.category, ContentCategory::Binary);
        assert_eq!(outcome.format, ContentFormat::Unknown);
    }

    /// A top-level JSON array also lands in the structured category.
    #[test]
    fn classify_json_category_is_structured() {
        let outcome = classify_content(b"[1, 2, 3]");

        assert_eq!(outcome.category, ContentCategory::Structured);
    }

    /// A PNG signature wins even when the bytes after it look like Markdown.
    #[test]
    fn magic_precedence_over_heuristic() {
        let png_then_markdown: Vec<u8> =
            [&b"\x89PNG\r\n\x1a\n"[..], &b"# Not actually markdown"[..]].concat();
        let outcome = classify_content(&png_then_markdown);

        assert_eq!(outcome.format, ContentFormat::Png);
    }
}