microformats 0.18.2

A union library of the Microformats types and associated parser.
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
#[cfg(feature = "debug_flow")]
use std::collections::HashMap;
use std::sync::Arc;

use self::element::{LinkRelationExpander, MatchedElements};
use microformats_types::{Properties, PropertyValue};
use regex::Regex;
use swc_common::{BytePos, FileName, SourceFile};
use swc_html_codegen::Emit as _;
use swc_html_parser::parser::ParserConfig;

/// A trait for custom hooks that can be called during parsing to tag or process nodes.
///
/// Implementations must be `Send + Sync`; the parser stores its hook as an
/// `Arc<dyn ParserHook>`. Both callbacks take `&self`, so a hook that needs to
/// accumulate state has to use interior mutability.
pub trait ParserHook: Send + Sync {
    /// Called when a property is matched on a node.
    ///
    /// * `node` - the node the property was matched on.
    /// * `name` - the name of the matched property.
    /// * `value` - the value resolved for the property.
    fn on_property_matched(
        &self,
        node: &element::Node,
        name: &str,
        value: &microformats_types::PropertyValue,
    );

    /// Called when an item is matched on a node.
    ///
    /// * `node` - the node the item was matched on.
    /// * `item_type` - the item's type as a string. For top-level items the parser
    ///   passes the first entry of the item's type list, or `"unknown"` when that
    ///   list is empty.
    fn on_item_matched(&self, node: &element::Node, item_type: &str);
}

mod element;

#[cfg(feature = "debug_flow")]
mod debug_flow;

#[cfg(feature = "debug_flow")]
pub use debug_flow::DebugHook;
mod head;
mod property;
mod test;
mod value_class;

#[cfg(feature = "picture")]
mod picture;

#[cfg(feature = "picture")]
pub use picture::{Picture, PictureParser, PictureParserBuilder, PictureSource, SrcSetEntry};

/// Errors that can be produced while parsing HTML into Microformats data.
#[derive(thiserror::Error, Debug, PartialEq, Eq)]
pub enum Error {
    /// The HTML parser rejected the input.
    #[error("Failed to parse HTML: {0:?}")]
    Html(swc_html_parser::error::Error),

    /// Serialising the DOM back into HTML failed; carries the generator's message.
    #[error("Failed to generate HTML: {0}")]
    HtmlCodegen(String),

    /// A child item was found at the given placement but no parent item could be found for it.
    #[error("Missing the parent item for a child item at the location {0:?}")]
    MissingParentItem(crate::parse::element::Placement),

    /// A property could not be expanded.
    #[error("Invalid property for expansion.")]
    InvalidPropertyExpansion,

    /// No item could be determined to receive the property found at the given placement.
    #[error("Could not determine which item to add a property to the location of {0:?}")]
    MissingParentItemForProperty(element::Placement),

    /// No parent item could be determined for the property declared at the given placement.
    #[error(
        "Could not determine which parent item to define a property to from the location of {0:?}"
    )]
    MissingParentItemForPropertyDeclaration(element::Placement),

    /// Neither the document nor the caller supplied a base URL to resolve relative URLs against.
    #[error("A URL to base relative URLs in this document is required.")]
    UrlBaseForDocumentRequired,

    /// An error bubbled up from `microformats_types`; displayed transparently.
    #[error(transparent)]
    Types(#[from] microformats_types::Error),

    /// A formatting error; displayed transparently.
    #[error(transparent)]
    Fmt(#[from] std::fmt::Error),

    /// A URL failed to parse; displayed transparently.
    #[error(transparent)]
    Url(#[from] url::ParseError),
}

impl From<swc_html_parser::error::Error> for Error {
    fn from(value: swc_html_parser::error::Error) -> Self {
        Self::Html(value)
    }
}

impl From<microformats_types::temporal::Error> for Error {
    fn from(value: microformats_types::temporal::Error) -> Self {
        Self::Types(microformats_types::Error::from(value))
    }
}

// Regular expressions shared by the parser. Each is compiled on first use; the
// `unwrap` calls can only panic if one of the literal patterns below is invalid.
lazy_static::lazy_static! {
    // Matches a run of one or more whitespace characters.
    static ref RE_WHITESPACE: Regex = Regex::new(r"(\s)+").unwrap();
    // Matches an entire class name of the form `<prefix>-<name>`:
    //   * `prefix` is one of `h`, `p`, `u`, `dt` or `e`;
    //   * `name` is an optional lowercase alphanumeric segment followed by a hyphen,
    //     then one or more hyphen-separated runs of lowercase letters.
    // Both parts are exposed as the named capture groups `prefix` and `name`.
    static ref RE_CLASS_NAME: Regex = Regex::new(r#"^(?P<prefix>((h|p|u|dt|e){1}))-(?P<name>([a-z0-9]+-)?[a-z]+(-[a-z]+)*)$"#).unwrap();
}

/// Returns `true` when `s` contains at least one byte.
// NOTE(review): the `&String` parameter (and hence the lint allowance) is presumably
// kept so this can be passed directly as a predicate over `String` items; confirm
// against the call sites before changing it to `&str`.
#[allow(clippy::ptr_arg)]
fn non_empty_string(s: &String) -> bool {
    let has_no_content = s.as_str().is_empty();
    !has_no_content
}

/// Returns `true` when the property value does not report itself as empty.
fn non_empty_property_value(p: &PropertyValue) -> bool {
    let has_no_content = p.is_empty();
    !has_no_content
}

/// Renders `text` to a string and returns a copy with all leading and trailing
/// Unicode whitespace removed. Interior whitespace is left untouched.
fn remove_surrounding_whitespace(text: impl ToString) -> String {
    let rendered = text.to_string();
    // `str::trim` strips the same character class as `char::is_whitespace`.
    rendered.trim().to_owned()
}

/// Locates the `<head>` element of the document.
///
/// Only direct `head` children of a top-level `html` element are considered. The
/// first match in document order is returned as a clone; `None` is returned when
/// no such element exists.
#[cfg(feature = "metaformats")]
fn find_head_element(dom: &swc_html_ast::Document) -> Option<swc_html_ast::Element> {
    dom.children
        .iter()
        // Keep only the top-level `<html>` elements.
        .filter_map(|node| match node {
            swc_html_ast::Child::Element(root) if root.tag_name == "html" => Some(root),
            _ => None,
        })
        // Walk their direct children in document order.
        .flat_map(|root| root.children.iter())
        .find_map(|node| match node {
            swc_html_ast::Child::Element(candidate) if candidate.tag_name == "head" => {
                Some(candidate.clone())
            }
            _ => None,
        })
}

/// Merges `addl_map` into `base_map`.
///
/// Values for a property name already present in `base_map` are appended to the
/// existing values; property names that are new to `base_map` are inserted as-is.
fn merge_hash_maps(base_map: &mut Properties, addl_map: Properties) {
    for (name, incoming) in addl_map {
        match base_map.get_mut(&name) {
            Some(existing) => existing.extend(incoming),
            None => {
                base_map.insert(name, incoming);
            }
        }
    }
}

/// A parsed node paired with an index identifying it.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct ElementRef {
    /// Index associated with this element.
    // NOTE(review): presumably the element's position in traversal order; confirm in `element`.
    pub index: usize,
    /// The node this reference wraps.
    pub node: element::Node,
}

/// Shared, reference-counted handle to an [`ElementRef`].
pub type ElementPtr = Arc<ElementRef>;

/// Holds a parsed HTML document together with the options used to extract
/// Microformats data from it.
pub struct Parser {
    /// The parsed HTML document.
    dom: swc_html_ast::Document,
    /// Optional hook notified when items and properties are matched.
    hook: Option<Arc<dyn ParserHook>>,
    /// Whether ID generation for AST elements is enabled.
    enable_id_generation: bool,
}

/// Cloning copies the DOM and the flag; the hook, when present, is shared through
/// its `Arc` rather than duplicated.
impl Clone for Parser {
    fn clone(&self) -> Self {
        let Self {
            dom,
            hook,
            enable_id_generation,
        } = self;

        Self {
            dom: dom.clone(),
            hook: hook.clone(),
            enable_id_generation: *enable_id_generation,
        }
    }
}

/// Opaque debug representation: only the type name is printed, none of the fields.
impl std::fmt::Debug for Parser {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        write!(f, "Parser")
    }
}

impl Parser {
    /// Parses the provided HTML into a DOM document prepared for Microformats parsing.
    ///
    /// # Errors
    ///
    /// This function will return an error if the HTML could not be parsed.
    #[tracing::instrument(level = "trace", err, fields(html = html.len()))]
    pub fn from_html(html: String) -> Result<Self, crate::Error> {
        let config = ParserConfig {
            scripting_enabled: false,
            iframe_srcdoc: false,
            allow_self_closing: true,
        };
        let mut html_errors = Default::default();
        let source_file = SourceFile::new(
            FileName::Anon.into(),
            false,
            FileName::Anon.into(),
            html.into(),
            BytePos(1),
        );
        let dom = swc_html_parser::parse_file_as_document(&source_file, config, &mut html_errors)
            .map_err(Error::from)?;

        drop(html_errors); // TODO: Report this back to the caller.

        Ok(Self {
            dom,
            hook: None,
            enable_id_generation: false,
        })
    }

    /// Sets a custom hook for the parser.
    pub fn with_hook(mut self, hook: Arc<dyn ParserHook>) -> Self {
        self.hook = Some(hook);
        self
    }

    /// Enables or disables ID generation for AST elements.
    pub fn with_id_generation(mut self, enable: bool) -> Self {
        self.enable_id_generation = enable;
        self
    }

    /// With the loaded DOM in memory, parses it into a [structured document][microformats_types::Document].
    ///
    /// # Errors
    ///
    /// This function will return an error if the DOM could not be parsed, relations could not be
    /// expanded or if items could not be expanded.
    #[tracing::instrument(level = "trace", skip(self), err, fields(base_url = base_url.as_ref().map(|u|u.to_string())))]
    pub fn into_document(
        &mut self,
        base_url: Option<url::Url>,
    ) -> Result<microformats_types::Document, crate::Error> {
        // Determine if we should collect debug info
        #[cfg(feature = "debug_flow")]
        let should_track_debug = self.enable_id_generation;

        #[cfg(feature = "debug_flow")]
        let doc = if should_track_debug {
            // Set up debug context
            // Note: We can't easily get the original HTML from the DOM,
            // so we'll leave it empty for now
            let debug_ctx = microformats_types::DebugContext {
                elements: HashMap::new(),
                property_sources: Vec::new(),
                original_html: String::new(),
            };

            // Parse with debug tracking
            let (result, debug_ctx) = debug_flow::with_debug_context(debug_ctx, || {
                self.hook = Some(std::sync::Arc::new(DebugHook));
                self.parse_document_internal(base_url)
            });
            let parsed_doc = result?;

            // Store debug context in document for later conversion
            let mut doc_with_debug = parsed_doc;
            doc_with_debug._debug_context = Some(debug_ctx);
            doc_with_debug
        } else {
            // Normal parsing without debug tracking
            self.parse_document_internal(base_url)?
        };

        #[cfg(not(feature = "debug_flow"))]
        let doc = self.parse_document_internal(base_url)?;

        Ok(doc)
    }

    /// Internal parsing logic shared by both debug and non-debug paths
    fn parse_document_internal(
        &mut self,
        base_url: Option<url::Url>,
    ) -> Result<microformats_types::Document, crate::Error> {
        let mut doc: microformats_types::Document = Default::default();
        let matched_elements = MatchedElements::for_document(
            &mut self.dom,
            self.hook.clone(),
            self.enable_id_generation,
        )?;

        let base_url = matched_elements
            .discern_base_url()
            .or(base_url)
            .ok_or(Error::UrlBaseForDocumentRequired)?;

        let link_relation_expander = LinkRelationExpander {
            base_url: base_url.clone(),
            elements: matched_elements.link_relation_elements(),
        };

        link_relation_expander.expand(&mut doc)?;

        for item_elem_ptr in matched_elements.top_level_elements() {
            let item_elem_ptr_clone = item_elem_ptr.clone();
            let item = matched_elements.expand_item_from_element(item_elem_ptr, &base_url)?;
            if let Some(hook) = &self.hook {
                let item_type = item
                    .r#type
                    .first()
                    .map(|c| c.to_string())
                    .unwrap_or_else(|| "unknown".to_string());
                hook.on_item_matched(&item_elem_ptr_clone.node, &item_type);
            }
            doc.items.push(item)
        }

        // Store document URL for home page detection (must be set before metaformats parsing)
        doc.url = Some(base_url.clone());

        // Parse metaformats from head element
        #[cfg(feature = "metaformats")]
        {
            if let Some(head_element) = find_head_element(&self.dom) {
                if let Some(meta_item) =
                    head::parse_metaformats_from_head(&head_element, &base_url, doc.url.as_ref())
                {
                    doc.meta_item = Some(meta_item);
                }
            }
        }

        Ok(doc)
    }

    /// Generates HTML from the current AST, including any added attributes like data-mf2-id.
    ///
    /// # Errors
    ///
    /// This function will return an error if HTML generation fails.
    pub fn to_html(&self) -> Result<String, crate::Error> {
        use swc_html_codegen::{
            CodeGenerator, CodegenConfig, Emit,
            writer::basic::{BasicHtmlWriter, BasicHtmlWriterConfig, IndentType, LineFeed},
        };

        let mut buf = std::ffi::OsString::new();
        let mut writer = BasicHtmlWriter::new(
            &mut buf,
            None,
            BasicHtmlWriterConfig {
                indent_type: IndentType::Space,
                indent_width: 2,
                linefeed: LineFeed::LF,
            },
        );
        let mut generator = CodeGenerator::new(
            &mut writer,
            CodegenConfig {
                minify: false,
                scripting_enabled: true,
                context_element: None,
                tag_omission: Some(true),
                keep_head_and_body: Some(true),
                self_closing_void_elements: Some(true),
                quotes: Some(true),
            },
        );
        generator
            .emit(&self.dom)
            .map_err(|e| crate::Error::HtmlCodegen(e.to_string()))?;
        buf.into_string()
            .map_err(|_| crate::Error::HtmlCodegen("Invalid UTF-8 in generated HTML".to_string()))
    }

    /// Creates a builder for constructing a Parser with custom options.
    pub fn builder() -> ParserBuilder {
        ParserBuilder::default()
    }
}

/// Builder for constructing a Parser with custom options.
///
/// The default builder has no HTML, no hook and ID generation disabled.
#[derive(Default)]
pub struct ParserBuilder {
    /// HTML to parse; must be set before calling `build`.
    html: Option<String>,
    /// Optional hook to install on the built parser.
    hook: Option<Arc<dyn ParserHook>>,
    /// Whether the built parser should generate IDs for AST elements.
    enable_id_generation: bool,
}

impl ParserBuilder {
    /// Supplies the HTML content that the built parser will parse.
    pub fn with_html(self, html: impl Into<String>) -> Self {
        Self {
            html: Some(html.into()),
            ..self
        }
    }

    /// Installs a custom hook on the parser being built.
    pub fn with_hook(self, hook: Arc<dyn ParserHook>) -> Self {
        Self {
            hook: Some(hook),
            ..self
        }
    }

    /// Turns ID generation for AST elements on or off.
    pub fn with_id_generation(self, enable: bool) -> Self {
        Self {
            enable_id_generation: enable,
            ..self
        }
    }

    /// Consumes the builder and produces a configured [`Parser`].
    ///
    /// # Errors
    ///
    /// Returns an error when no HTML was supplied via `with_html`, or when the
    /// supplied HTML fails to parse.
    pub fn build(self) -> Result<Parser, crate::Error> {
        let Self {
            html,
            hook,
            enable_id_generation,
        } = self;

        let html = match html {
            Some(source) => source,
            None => {
                let missing = std::io::Error::new(
                    std::io::ErrorKind::InvalidInput,
                    "HTML content not provided",
                );
                return Err(missing.into());
            }
        };

        let parser = Parser::from_html(html)?;
        let parser = match hook {
            Some(h) => parser.with_hook(h),
            None => parser,
        };
        Ok(parser.with_id_generation(enable_id_generation))
    }
}