Skip to main content

djvu_rs/
metadata.rs

1//! DjVu document metadata parser — phase 4 extension.
2//!
3//! Parses METa (plain text) and METz (BZZ-compressed) metadata chunks into a
4//! structured [`DjVuMetadata`] value.
5//!
6//! ## Key public types
7//!
8//! - [`DjVuMetadata`] — key-value metadata extracted from a DjVu document
9//! - [`MetadataError`] — typed errors from this module
10//!
11//! ## Format notes
12//!
13//! METa/METz encode metadata as an S-expression:
14//!
15//! ```text
16//! (metadata
17//!   (author "Author Name")
18//!   (title "Book Title")
19//!   (subject "Subject")
20//!   (year "2023")
21//!   (keywords "keyword1, keyword2")
22//! )
23//! ```
24//!
25//! This module accepts arbitrary key names; well-known keys populate dedicated
26//! fields while anything else goes into [`DjVuMetadata::extra`].
27
28#[cfg(not(feature = "std"))]
29use alloc::{
30    string::{String, ToString},
31    vec::Vec,
32};
33
34use crate::{bzz_new::bzz_decode, error::BzzError};
35
36// ---- Error ------------------------------------------------------------------
37
38/// Errors from metadata parsing.
39#[derive(Debug, thiserror::Error)]
40pub enum MetadataError {
41    /// BZZ decompression failed.
42    #[error("bzz decode failed: {0}")]
43    Bzz(#[from] BzzError),
44
45    /// The chunk is not valid UTF-8.
46    #[error("metadata chunk is not valid UTF-8")]
47    InvalidUtf8,
48}
49
50// ---- Public types -----------------------------------------------------------
51
/// Document metadata collected from a METa/METz chunk.
///
/// Recognised keys are routed to the named fields below; any other key is
/// kept verbatim in [`DjVuMetadata::extra`].  Every value is stored as the
/// plain string found in the chunk — no further interpretation is applied.
/// When a recognised key occurs more than once, the last occurrence wins.
#[derive(Debug, Clone, Default, PartialEq, Eq)]
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
pub struct DjVuMetadata {
    /// Title of the document (`title` key).
    pub title: Option<String>,
    /// Author or authors (`author` key).
    pub author: Option<String>,
    /// Subject or description (`subject` or `description` key).
    pub subject: Option<String>,
    /// Name of the publisher (`publisher` key).
    pub publisher: Option<String>,
    /// Publication year, stored as written (`year` or `date` key).
    pub year: Option<String>,
    /// Keywords exactly as stored, typically comma-separated
    /// (`keywords` or `keyword` key).
    pub keywords: Option<String>,
    /// Unrecognised `(key, value)` pairs in the order they were encountered.
    pub extra: Vec<(String, String)>,
}
75
76// ---- Entry points -----------------------------------------------------------
77
78/// Parse a METa (uncompressed) metadata chunk.
79///
80/// `data` is the raw bytes of the METa chunk (not including the 4-byte chunk
81/// ID or the 4-byte length prefix — just the payload).
82pub fn parse_metadata(data: &[u8]) -> Result<DjVuMetadata, MetadataError> {
83    let text = core::str::from_utf8(data).map_err(|_| MetadataError::InvalidUtf8)?;
84    Ok(parse_metadata_text(text))
85}
86
87/// Parse a METz (BZZ-compressed) metadata chunk.
88///
89/// Decompresses with BZZ first, then delegates to [`parse_metadata`].
90pub fn parse_metadata_bzz(data: &[u8]) -> Result<DjVuMetadata, MetadataError> {
91    let decoded = bzz_decode(data)?;
92    parse_metadata(&decoded)
93}
94
95// ---- Internal parsing -------------------------------------------------------
96
97fn parse_metadata_text(text: &str) -> DjVuMetadata {
98    let tokens = tokenize(text);
99    let sexprs = parse_sexprs(&tokens);
100
101    let mut meta = DjVuMetadata::default();
102
103    // Look for a top-level (metadata ...) list
104    for expr in &sexprs {
105        if let SExpr::List(items) = expr
106            && let Some(SExpr::Atom(head)) = items.first()
107        {
108            if !head.eq_ignore_ascii_case("metadata") {
109                continue;
110            }
111            for item in &items[1..] {
112                if let SExpr::List(pair) = item
113                    && let (Some(SExpr::Atom(key)), Some(SExpr::Atom(val))) =
114                        (pair.first(), pair.get(1))
115                {
116                    store_kv(&mut meta, key, val);
117                }
118            }
119        }
120    }
121
122    meta
123}
124
125fn store_kv(meta: &mut DjVuMetadata, key: &str, value: &str) {
126    match key.to_lowercase().as_str() {
127        "title" => meta.title = Some(value.to_string()),
128        "author" => meta.author = Some(value.to_string()),
129        "subject" | "description" => meta.subject = Some(value.to_string()),
130        "publisher" => meta.publisher = Some(value.to_string()),
131        "year" | "date" => meta.year = Some(value.to_string()),
132        "keywords" | "keyword" => meta.keywords = Some(value.to_string()),
133        _ => meta.extra.push((key.to_string(), value.to_string())),
134    }
135}
136
137// ---- Minimal S-expression tokenizer/parser ----------------------------------
138//
139// A self-contained subset that handles the metadata format.
140// Supports atoms (unquoted), quoted strings, and nested lists.
141
/// Lexical token of the metadata S-expression syntax.
#[derive(Debug)]
enum Token<'a> {
    /// Opening parenthesis `(`.
    LParen,
    /// Closing parenthesis `)`.
    RParen,
    /// Unquoted atom, borrowed directly from the input text.
    Atom(&'a str),
    /// Contents of a double-quoted string with backslash escapes resolved.
    Quoted(String),
}

/// Split `input` into parentheses, atoms and quoted strings.
///
/// * Space, tab, `\n` and `\r` separate tokens and are otherwise skipped.
/// * `;` at token position starts a comment that runs to the end of the line.
/// * Inside a quoted string a backslash makes the following character
///   literal (so `\"` yields `"` and `\\` yields `\`); a backslash that is
///   the very last character of the input is kept as-is.  An unterminated
///   string extends to the end of the input.
/// * Any other run of characters up to the next parenthesis, double quote or
///   whitespace is an atom.
///
/// The scan works on `char`s rather than bytes so that non-ASCII text inside
/// quoted strings is preserved instead of being split into one code point
/// per UTF-8 byte.
///
/// NOTE(review): C-style escapes such as `\n` or octal `\ooo` are not
/// interpreted — the escaped character is taken literally.  Confirm whether
/// the metadata writers in use rely on them.
fn tokenize(input: &str) -> Vec<Token<'_>> {
    let mut tokens = Vec::new();
    let mut chars = input.char_indices().peekable();

    while let Some(&(start, ch)) = chars.peek() {
        match ch {
            '(' => {
                tokens.push(Token::LParen);
                chars.next();
            }
            ')' => {
                tokens.push(Token::RParen);
                chars.next();
            }
            '"' => {
                chars.next(); // opening quote
                let mut s = String::new();
                while let Some((_, c)) = chars.next() {
                    match c {
                        '\\' => match chars.next() {
                            Some((_, escaped)) => s.push(escaped),
                            // Lone backslash at end of input: keep it.
                            None => s.push('\\'),
                        },
                        '"' => break,
                        other => s.push(other),
                    }
                }
                tokens.push(Token::Quoted(s));
            }
            ' ' | '\t' | '\n' | '\r' => {
                chars.next();
            }
            ';' => {
                // Comment: skip up to (but not including) the newline.
                while let Some(&(_, c)) = chars.peek() {
                    if c == '\n' {
                        break;
                    }
                    chars.next();
                }
            }
            _ => {
                // Atom: runs until the next delimiter or the end of input.
                let mut end = input.len();
                while let Some(&(idx, c)) = chars.peek() {
                    if matches!(c, '(' | ')' | '"' | ' ' | '\t' | '\n' | '\r') {
                        end = idx;
                        break;
                    }
                    chars.next();
                }
                // `start` and `end` come from `char_indices`, so they are
                // always valid char boundaries.
                if let Some(atom) = input.get(start..end) {
                    tokens.push(Token::Atom(atom));
                }
            }
        }
    }

    tokens
}
218
219#[derive(Debug)]
220enum SExpr {
221    Atom(String),
222    List(Vec<SExpr>),
223}
224
225fn parse_sexprs(tokens: &[Token<'_>]) -> Vec<SExpr> {
226    let mut result = Vec::new();
227    let mut pos = 0usize;
228    while pos < tokens.len() {
229        if let Some(expr) = parse_one(tokens, &mut pos) {
230            result.push(expr);
231        }
232    }
233    result
234}
235
236fn parse_one(tokens: &[Token<'_>], pos: &mut usize) -> Option<SExpr> {
237    match tokens.get(*pos) {
238        Some(Token::LParen) => {
239            *pos += 1;
240            let mut items = Vec::new();
241            loop {
242                match tokens.get(*pos) {
243                    Some(Token::RParen) => {
244                        *pos += 1;
245                        break;
246                    }
247                    None => break,
248                    _ => {
249                        if let Some(child) = parse_one(tokens, pos) {
250                            items.push(child);
251                        } else {
252                            break;
253                        }
254                    }
255                }
256            }
257            Some(SExpr::List(items))
258        }
259        Some(Token::RParen) => {
260            *pos += 1;
261            None
262        }
263        Some(Token::Atom(s)) => {
264            let s = s.to_string();
265            *pos += 1;
266            Some(SExpr::Atom(s))
267        }
268        Some(Token::Quoted(s)) => {
269            let s = s.clone();
270            *pos += 1;
271            Some(SExpr::Atom(s))
272        }
273        None => None,
274    }
275}
276
277// ---- Tests ------------------------------------------------------------------
278
#[cfg(test)]
mod tests {
    use super::*;

    /// Parse `payload` as an uncompressed chunk, panicking on any error.
    fn parsed(payload: &[u8]) -> DjVuMetadata {
        parse_metadata(payload).unwrap()
    }

    #[test]
    fn empty_input_returns_default() {
        assert_eq!(parsed(b""), DjVuMetadata::default());
    }

    #[test]
    fn basic_metadata_block() {
        let meta = parsed(br#"(metadata (title "My Book") (author "Jane Doe") (year "2023"))"#);
        assert_eq!(meta.title, Some("My Book".to_string()));
        assert_eq!(meta.author, Some("Jane Doe".to_string()));
        assert_eq!(meta.year, Some("2023".to_string()));
        assert_eq!(meta.subject, None);
    }

    #[test]
    fn subject_and_keywords() {
        let meta = parsed(br#"(metadata (subject "Science") (keywords "physics, chemistry"))"#);
        assert_eq!(meta.subject, Some("Science".to_string()));
        assert_eq!(meta.keywords, Some("physics, chemistry".to_string()));
    }

    #[test]
    fn description_alias_maps_to_subject() {
        let meta = parsed(br#"(metadata (description "A long description"))"#);
        assert_eq!(meta.subject, Some("A long description".to_string()));
    }

    #[test]
    fn date_alias_maps_to_year() {
        let meta = parsed(br#"(metadata (date "2020-01-15"))"#);
        assert_eq!(meta.year, Some("2020-01-15".to_string()));
    }

    #[test]
    fn extra_keys_go_to_extra_vec() {
        let meta = parsed(br#"(metadata (custom-field "value1") (another "value2"))"#);
        let expected = [
            ("custom-field".to_string(), "value1".to_string()),
            ("another".to_string(), "value2".to_string()),
        ];
        assert_eq!(meta.extra.len(), 2);
        assert_eq!(meta.extra[0], expected[0]);
        assert_eq!(meta.extra[1], expected[1]);
    }

    #[test]
    fn publisher_field() {
        let meta = parsed(br#"(metadata (publisher "Oxford University Press"))"#);
        assert_eq!(meta.publisher, Some("Oxford University Press".to_string()));
    }

    #[test]
    fn case_insensitive_keys() {
        let meta = parsed(br#"(metadata (TITLE "Upper") (Author "Mixed"))"#);
        assert_eq!(meta.title, Some("Upper".to_string()));
        assert_eq!(meta.author, Some("Mixed".to_string()));
    }

    #[test]
    fn escaped_quotes_in_value() {
        let meta = parsed(br#"(metadata (title "Book with \"quotes\""))"#);
        assert_eq!(meta.title, Some(r#"Book with "quotes""#.to_string()));
    }

    #[test]
    fn no_metadata_wrapper_returns_default() {
        // Without a (metadata ...) list nothing is collected.
        assert_eq!(parsed(br#"(background #ffffff)"#), DjVuMetadata::default());
    }

    #[test]
    fn multiline_metadata() {
        let meta = parsed(b"(metadata\n  (title \"Line1\")\n  (author \"Line2\")\n)");
        assert_eq!(meta.title, Some("Line1".to_string()));
        assert_eq!(meta.author, Some("Line2".to_string()));
    }

    #[test]
    fn invalid_utf8_returns_error() {
        let err = parse_metadata(b"\xFF\xFE").unwrap_err();
        assert!(matches!(err, MetadataError::InvalidUtf8));
    }
}