Skip to main content

lex_babel/formats/html/
mod.rs

1//! HTML format implementation
2//!
3//! This module implements bidirectional conversion between Lex and HTML5.
4//!
5//! # Library Choice
6//!
7//! We use the `html5ever` + `rcdom` + `markup5ever` ecosystem for HTML parsing and serialization:
8//! - `html5ever`: Browser-grade HTML5 parser from the Servo project
9//! - `markup5ever_rcdom`: Reference-counted DOM tree implementation
10//! - `markup5ever`: Serialization infrastructure
11//!
12//! This choice is based on:
13//! - Complete solution for both parsing and serialization
14//! - Battle-tested with 12M+ downloads
15//! - WHATWG HTML5 specification compliance
16//! - Active maintenance by Servo project
17//! - Handles malformed HTML gracefully
18//!
19//! # Element Mapping Table
20//!
21//! Complete Lex ↔ HTML Mapping:
22//!
23//! | Lex Element      | HTML Equivalent                                    | Export Notes                              | Import Notes                          |
24//! |------------------|----------------------------------------------------|-------------------------------------------|---------------------------------------|
25//! | Document         | `<div class="lex-document">`                       | Root container with document class        | Parse body content                    |
26//! | Session          | `<section class="lex-session lex-session-N">` + `<hN>` | Session → section + heading        | section + heading → Session           |
27//! | Paragraph        | `<p class="lex-paragraph">`                        | Direct mapping with class                 | Direct mapping                        |
28//! | List             | `<ul>`/`<ol>` with `class="lex-list"`              | Ordered/unordered preserved with class    | Detect ul/ol type                     |
29//! | ListItem         | `<li class="lex-list-item">`                       | Direct mapping with class                 | Direct mapping                        |
30//! | Definition       | `<dl class="lex-definition">` `<dt>` `<dd>`        | Term in dt, description in dd             | Parse dl/dt/dd structure              |
31//! | Verbatim         | `<pre class="lex-verbatim">` `<code>`              | Language → data-language attribute        | Extract language from attribute       |
32//! | Annotation       | `<!-- lex:label key=val -->`                       | HTML comment format                       | Parse HTML comment pattern            |
33//! | InlineContent:   |                                                    |                                           |                                       |
34//! |   Text           | Plain text                                         | Direct                                    | Direct                                |
35//! |   Bold           | `<strong>`                                         | Semantic strong tag                       | Parse both strong and b               |
36//! |   Italic         | `<em>`                                             | Semantic emphasis tag                     | Parse both em and i                   |
37//! |   Code           | `<code>`                                           | Inline code tag                           | Direct                                |
38//! |   Math           | `<span class="lex-math">`                          | Preserve $ delimiters in span             | Parse math span                       |
39//! |   Reference      | `<a href="url">text</a>`                           | Convert to anchor with prev word as text  | Parse anchor back to reference        |
40//!
41//! # CSS Classes
42//!
43//! All Lex elements receive CSS classes matching their AST structure:
44//! - `.lex-document`: Root document container
45//! - `.lex-session`, `.lex-session-1`, `.lex-session-2`, etc.: Sessions with depth
46//! - `.lex-paragraph`: Paragraphs
47//! - `.lex-list`: Lists (combined with ul/ol)
48//! - `.lex-list-item`: List items
49//! - `.lex-definition`: Definition lists
50//! - `.lex-verbatim`: Verbatim/code blocks
51//! - `.lex-math`: Math expressions
52//!
53//! This enables:
54//! - Precise CSS targeting for presentation
55//! - Structure-preserving round-trip conversion (HTML → Lex → HTML) once HTML import is implemented
56//! - Custom theming without modifying structure
57//!
58//! # CSS and Theming
59//!
60//! HTML export includes embedded CSS from:
61//! - `css/baseline.css`: Browser reset + default modern presentation (always included)
62//! - `css/themes/theme-*.css`: Optional overrides layered on top of the baseline
63//!
64//! The default theme (`HtmlTheme::Modern`) injects an empty stylesheet so the
65//! baseline alone controls rendering. Other themes, like Fancy Serif, only add
66//! targeted overrides.
67//!
68//! Themes use Google Fonts and are mobile-responsive.
69//!
70//! # Output Format
71//!
72//! Export produces a single, self-contained HTML file:
73//! - Complete HTML5 document structure
74//! - Embedded CSS in a `<style>` tag
75//! - No external dependencies (except optionally-linked fonts)
76//! - Mobile-responsive viewport meta tag
77//!
78//! # Lossy Conversions
79//!
80//! The following conversions may lose information on round-trip:
81//! - Lex sessions beyond level 6 → h6 with nested sections (HTML heading limit)
82//! - Lex annotations → HTML comments (exported but parsing is lossy)
83//! - Some whitespace normalization
84//!
85//! # Architecture Notes
86//!
87//! Like the Markdown implementation, we handle the nested-to-flat conversion using the IR
88//! event system (lex-babel/src/common/). HTML is more naturally hierarchical than Markdown,
89//! but sessions still require special handling as they don't map directly to HTML's heading
90//! structure.
91//!
92//! We use semantic HTML elements with CSS classes for styling rather than presentational
93//! elements.
94//!
95//! # Implementation Status
96//!
97//! - [x] Export (Lex → HTML)
98//!   - [ ] Document structure with CSS embedding
99//!   - [ ] Paragraph
100//!   - [ ] Heading (Session) → section + heading
101//!   - [ ] Bold, Italic, Code inlines
102//!   - [ ] Lists - ordered/unordered
103//!   - [ ] Code blocks (Verbatim) with language attribute
104//!   - [ ] Definitions → dl/dt/dd
105//!   - [ ] Annotations → HTML comments
106//!   - [ ] Math → span with class
107//!   - [ ] References → anchors with link conversion
108//! - [ ] Import (HTML → Lex)
109//!   - [ ] All elements (to be implemented after export)
110
111mod serializer;
112
113use crate::error::FormatError;
114use crate::format::Format;
115use lex_core::lex::ast::Document;
116use std::fs;
117
118pub use serializer::HtmlOptions;
119
120/// Returns the default baseline CSS used for HTML export.
121///
122/// This is the same CSS embedded in all HTML exports when no custom CSS is provided.
123/// Use this to get a starting point for custom styling.
124pub fn get_default_css() -> &'static str {
125    include_str!("../../../css/baseline.css")
126}
127
/// Format implementation for HTML.
///
/// Carries the CSS theme that is applied when a document is exported.
/// `HtmlFormat::default()` uses the default theme, [`HtmlTheme::Modern`];
/// the `Default` impl is derived so it always follows the enum's `#[default]`
/// variant instead of repeating that choice by hand.
#[derive(Debug, Clone, Default)]
pub struct HtmlFormat {
    /// CSS theme to use for export
    theme: HtmlTheme,
}

/// Available CSS themes for HTML export.
///
/// [`HtmlTheme::Modern`] is the default variant.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
pub enum HtmlTheme {
    /// Serif typography override (fonts only, layout comes from baseline)
    FancySerif,
    /// Baseline modern theme (no-op; relies on baseline.css)
    #[default]
    Modern,
}
149
150impl HtmlFormat {
151    /// Create a new HTML format with the specified theme
152    pub fn new(theme: HtmlTheme) -> Self {
153        Self { theme }
154    }
155
156    /// Create HTML format with fancy serif theme
157    pub fn with_fancy_serif() -> Self {
158        Self::new(HtmlTheme::FancySerif)
159    }
160
161    /// Create HTML format with modern theme
162    pub fn with_modern() -> Self {
163        Self::new(HtmlTheme::Modern)
164    }
165}
166
167impl Format for HtmlFormat {
168    fn name(&self) -> &str {
169        "html"
170    }
171
172    fn description(&self) -> &str {
173        "HTML5 format with embedded CSS"
174    }
175
176    fn file_extensions(&self) -> &[&str] {
177        &["html", "htm"]
178    }
179
180    fn supports_parsing(&self) -> bool {
181        false // Implement after export is working
182    }
183
184    fn supports_serialization(&self) -> bool {
185        true
186    }
187
188    fn parse(&self, _source: &str) -> Result<Document, FormatError> {
189        Err(FormatError::NotSupported(
190            "HTML import not yet implemented".to_string(),
191        ))
192    }
193
194    fn serialize(&self, doc: &Document) -> Result<String, FormatError> {
195        serializer::serialize_to_html(doc, self.theme)
196    }
197
198    fn serialize_with_options(
199        &self,
200        doc: &Document,
201        options: &std::collections::HashMap<String, String>,
202    ) -> Result<crate::format::SerializedDocument, FormatError> {
203        let mut theme = self.theme;
204        if let Some(theme_str) = options.get("theme") {
205            theme = match theme_str.as_str() {
206                "fancy-serif" => HtmlTheme::FancySerif,
207                "modern" | "default" => HtmlTheme::Modern,
208                _ => {
209                    // Fallback to default for unknown themes, or could error.
210                    // For now, let's fallback to Modern to be safe.
211                    HtmlTheme::Modern
212                }
213            };
214        }
215
216        let mut html_options = HtmlOptions::new(theme);
217
218        // Handle custom CSS option (expects CSS content, not path)
219        if let Some(css_content) = options.get("custom_css") {
220            html_options = html_options.with_custom_css(css_content.clone());
221        } else if let Some(css_path) = options.get("css-path").or_else(|| options.get("css_path")) {
222            let css = fs::read_to_string(css_path).map_err(|err| {
223                FormatError::SerializationError(format!(
224                    "Failed to read CSS at '{}': {}",
225                    css_path, err
226                ))
227            })?;
228            html_options = html_options.with_custom_css(css);
229        }
230
231        serializer::serialize_to_html_with_options(doc, html_options)
232            .map(crate::format::SerializedDocument::Text)
233    }
234}
235
#[cfg(test)]
mod tests {
    use super::*;
    use crate::format::SerializedDocument;
    use lex_core::lex::ast::Document;
    use std::collections::HashMap;
    use std::io::Write;
    use tempfile::NamedTempFile;

    /// Builds an options map that only sets `css-path` to `path`.
    fn css_path_options(path: String) -> HashMap<String, String> {
        HashMap::from([("css-path".to_string(), path)])
    }

    /// The baseline CSS exposes the core Lex selectors and is non-trivial in size.
    #[test]
    fn test_get_default_css_returns_baseline() {
        let css = get_default_css();
        // Key selectors from baseline.css must all be present.
        for selector in [".lex-document", ".lex-paragraph", ".lex-session"] {
            assert!(css.contains(selector));
        }
        // Should be non-trivial content
        assert!(css.len() > 1000);
    }

    /// The returned CSS contains `--lex-` custom properties.
    #[test]
    fn test_get_default_css_is_same_as_embedded() {
        // Checking for CSS custom properties confirms this is the real stylesheet text.
        assert!(get_default_css().contains("--lex-"));
    }

    /// A readable `css-path` file ends up in the exported HTML.
    #[test]
    fn test_css_path_option_loads_file() {
        let mut css_file = NamedTempFile::new().expect("failed to create temp file");
        css_file
            .write_all(b".from-path { color: blue; }\n")
            .expect("failed to write temp css");

        let options = css_path_options(css_file.path().to_string_lossy().into_owned());
        let exported = HtmlFormat::default()
            .serialize_with_options(&Document::new(), &options)
            .expect("html export should succeed");

        match exported {
            SerializedDocument::Text(content) => {
                assert!(content.contains(".from-path { color: blue; }"));
            }
            _ => panic!("expected text html output"),
        }
    }

    /// A missing `css-path` file is reported as a serialization error naming the path.
    #[test]
    fn test_css_path_option_errors_on_missing_file() {
        let options = css_path_options("/no/such/file.css".to_string());
        let outcome = HtmlFormat::default().serialize_with_options(&Document::new(), &options);

        let Err(err) = outcome else {
            panic!("expected css-path lookup to fail");
        };

        if let FormatError::SerializationError(msg) = err {
            assert!(msg.contains("/no/such/file.css"));
        } else {
            panic!("expected serialization error, got {err:?}");
        }
    }
}