Skip to main content

webfetch/
media.rs

1//! Decide how to treat a fetched body. The HTML extractor only makes sense
2//! for HTML; running it over a JSON API response, a raw `.txt`, or a Markdown
3//! file would mangle or drop the content. We classify by `Content-Type` when
4//! present, and sniff the body otherwise.
5
/// The kind of content a fetched body holds, which decides how it is
/// processed downstream.
#[derive(Clone, Debug, Eq, PartialEq)]
pub enum Media {
    /// An HTML document.
    Html,
    /// A JSON document.
    Json,
    /// Plain text.
    Text,
    /// A body that is not rendered (binary data, PDFs, images, ...). The
    /// payload is the content type, which is passed through to the caller.
    Other(String),
}
15
16impl Media {
17    /// A short, stable label for the `media` field of a result.
18    pub fn label(&self) -> String {
19        match self {
20            Media::Html => "html".into(),
21            Media::Json => "json".into(),
22            Media::Text => "text".into(),
23            Media::Other(ct) => ct.clone(),
24        }
25    }
26}
27
28/// Classify a body using its `Content-Type` header if available, else by
29/// sniffing the first non-whitespace bytes.
30pub fn classify(content_type: Option<&str>, body: &str) -> Media {
31    if let Some(ct) = content_type {
32        let essence = ct
33            .split(';')
34            .next()
35            .unwrap_or("")
36            .trim()
37            .to_ascii_lowercase();
38        if essence.contains("html") || essence == "application/xhtml+xml" {
39            return Media::Html;
40        }
41        if essence.contains("json") {
42            return Media::Json;
43        }
44        if essence.starts_with("text/") {
45            return Media::Text;
46        }
47        if !essence.is_empty() {
48            return Media::Other(essence);
49        }
50    }
51    sniff(body)
52}
53
54fn sniff(body: &str) -> Media {
55    let trimmed = body.trim_start();
56    match trimmed.as_bytes().first() {
57        Some(b'<') => Media::Html,
58        Some(b'{') | Some(b'[') => {
59            if serde_json::from_str::<serde_json::Value>(trimmed).is_ok() {
60                Media::Json
61            } else {
62                Media::Text
63            }
64        }
65        _ => Media::Text,
66    }
67}