Skip to main content

entrenar/research/
literate.rs

1//! Literate Document with Typst support (ENT-021)
2//!
3//! Provides literate programming document support with code block extraction.
4
5use regex::Regex;
6use serde::{Deserialize, Serialize};
7use std::sync::LazyLock;
8
9/// Regex for extracting code blocks from Typst documents
10static TYPST_CODE_BLOCK: LazyLock<Regex> = LazyLock::new(|| {
11    Regex::new(r"```(\w*)\n([\s\S]*?)```").expect("Invalid Typst code block regex")
12});
13
14/// Regex for extracting code blocks from Markdown
15static MARKDOWN_CODE_BLOCK: LazyLock<Regex> = LazyLock::new(|| {
16    Regex::new(r"```(\w*)\n([\s\S]*?)```").expect("Invalid Markdown code block regex")
17});
18
/// A code block extracted from a literate document.
///
/// Values are produced by [`LiterateDocument::extract_code_blocks`] or built by
/// hand with [`CodeBlock::new`] and [`CodeBlock::with_language`].
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct CodeBlock {
    /// Programming language (if specified).
    ///
    /// `None` when the fence carried no language tag; `with_language` never
    /// stores an empty string here.
    pub language: Option<String>,
    /// Code content.
    ///
    /// For extracted blocks this is the text between the fences with trailing
    /// whitespace trimmed.
    pub content: String,
    /// Line number where block starts (1-indexed).
    ///
    /// For extracted blocks this is the line holding the opening fence.
    pub line_number: usize,
}
29
30impl CodeBlock {
31    /// Create a new code block
32    pub fn new(content: impl Into<String>, line_number: usize) -> Self {
33        Self { language: None, content: content.into(), line_number }
34    }
35
36    /// Set the language
37    pub fn with_language(mut self, language: impl Into<String>) -> Self {
38        let lang = language.into();
39        self.language = if lang.is_empty() { None } else { Some(lang) };
40        self
41    }
42}
43
/// Literate document types.
///
/// Every variant owns the complete source text of the document. The variant
/// selects how that text is interpreted: `Typst` and `Markdown` are scanned for
/// fenced code blocks and rendered with headings, paragraphs and code blocks,
/// whereas `RawText` yields no code blocks and is rendered as one preformatted
/// section.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub enum LiterateDocument {
    /// Typst document
    Typst(String),
    /// Markdown document
    Markdown(String),
    /// Raw text (no special parsing)
    RawText(String),
}
54
55impl LiterateDocument {
56    /// Parse a Typst document from string
57    pub fn parse_typst(content: impl Into<String>) -> Self {
58        Self::Typst(content.into())
59    }
60
61    /// Parse a Markdown document from string
62    pub fn parse_markdown(content: impl Into<String>) -> Self {
63        Self::Markdown(content.into())
64    }
65
66    /// Create a raw text document
67    pub fn raw(content: impl Into<String>) -> Self {
68        Self::RawText(content.into())
69    }
70
71    /// Get the raw content
72    pub fn content(&self) -> &str {
73        match self {
74            Self::Typst(s) | Self::Markdown(s) | Self::RawText(s) => s,
75        }
76    }
77
78    /// Extract code blocks from the document
79    pub fn extract_code_blocks(&self) -> Vec<CodeBlock> {
80        match self {
81            Self::Typst(content) => extract_blocks_with_regex(content, &TYPST_CODE_BLOCK),
82            Self::Markdown(content) => extract_blocks_with_regex(content, &MARKDOWN_CODE_BLOCK),
83            Self::RawText(_) => Vec::new(),
84        }
85    }
86
87    /// Convert to basic HTML representation
88    pub fn to_html(&self) -> String {
89        match self {
90            Self::Typst(content) | Self::Markdown(content) => {
91                let mut html = String::new();
92                html.push_str("<!DOCTYPE html>\n<html>\n<head>\n");
93                html.push_str("<meta charset=\"utf-8\">\n");
94                html.push_str("<style>\n");
95                html.push_str("body { font-family: system-ui, sans-serif; max-width: 800px; margin: 0 auto; padding: 2rem; }\n");
96                html.push_str("pre { background: #f5f5f5; padding: 1rem; overflow-x: auto; }\n");
97                html.push_str("code { font-family: monospace; }\n");
98                html.push_str("</style>\n</head>\n<body>\n");
99
100                // Simple conversion: paragraphs and code blocks
101                let mut in_code_block = false;
102                let mut code_lang = String::new();
103                let mut code_content = String::new();
104
105                for line in content.lines() {
106                    if line.starts_with("```") {
107                        if in_code_block {
108                            // End code block
109                            html.push_str("<pre><code");
110                            if !code_lang.is_empty() {
111                                html.push_str(&format!(" class=\"language-{code_lang}\""));
112                            }
113                            html.push('>');
114                            html.push_str(&escape_html(&code_content));
115                            html.push_str("</code></pre>\n");
116                            code_content.clear();
117                            code_lang.clear();
118                            in_code_block = false;
119                        } else {
120                            // Start code block
121                            code_lang = line.trim_start_matches('`').to_string();
122                            in_code_block = true;
123                        }
124                    } else if in_code_block {
125                        if !code_content.is_empty() {
126                            code_content.push('\n');
127                        }
128                        code_content.push_str(line);
129                    } else if line.starts_with('#') {
130                        // Heading
131                        let level = line.chars().take_while(|&c| c == '#').count().min(6);
132                        let text = line.trim_start_matches('#').trim();
133                        html.push_str(&format!("<h{level}>{}</h{level}>\n", escape_html(text)));
134                    } else if line.is_empty() {
135                        // Empty line
136                    } else {
137                        // Paragraph
138                        html.push_str(&format!("<p>{}</p>\n", escape_html(line)));
139                    }
140                }
141
142                html.push_str("</body>\n</html>");
143                html
144            }
145            Self::RawText(content) => {
146                format!(
147                    "<!DOCTYPE html>\n<html>\n<body>\n<pre>{}</pre>\n</body>\n</html>",
148                    escape_html(content)
149                )
150            }
151        }
152    }
153
154    /// Check if this is a Typst document
155    pub fn is_typst(&self) -> bool {
156        matches!(self, Self::Typst(_))
157    }
158
159    /// Check if this is a Markdown document
160    pub fn is_markdown(&self) -> bool {
161        matches!(self, Self::Markdown(_))
162    }
163
164    /// Check if this is raw text
165    pub fn is_raw(&self) -> bool {
166        matches!(self, Self::RawText(_))
167    }
168}
169
170/// Extract code blocks using a regex pattern
171fn extract_blocks_with_regex(content: &str, pattern: &Regex) -> Vec<CodeBlock> {
172    let mut blocks = Vec::new();
173
174    for cap in pattern.captures_iter(content) {
175        let full_match = cap.get(0).expect("capture group 0 always exists in a regex match");
176        let lang = cap.get(1).map(|m| m.as_str().to_string());
177        let code = cap.get(2).map(|m| m.as_str().to_string()).unwrap_or_default();
178
179        // Calculate line number
180        let line_number = content[..full_match.start()].chars().filter(|&c| c == '\n').count() + 1;
181
182        let mut block = CodeBlock::new(code.trim_end(), line_number);
183        if let Some(l) = lang {
184            block = block.with_language(l);
185        }
186        blocks.push(block);
187    }
188
189    blocks
190}
191
/// Replace the five HTML-significant characters (`&`, `<`, `>`, `"`, `'`) with
/// their entities, leaving every other character untouched.
fn escape_html(s: &str) -> String {
    let mut escaped = String::with_capacity(s.len());
    for ch in s.chars() {
        match ch {
            '&' => escaped.push_str("&amp;"),
            '<' => escaped.push_str("&lt;"),
            '>' => escaped.push_str("&gt;"),
            '"' => escaped.push_str("&quot;"),
            '\'' => escaped.push_str("&#39;"),
            other => escaped.push(other),
        }
    }
    escaped
}
200
// Unit tests for document construction, code-block extraction and HTML rendering.
#[cfg(test)]
mod tests {
    use super::*;

    /// A Typst document keeps its text verbatim and reports itself as Typst.
    #[test]
    fn test_typst_parsing() {
        let content = r#"
= Introduction

This is a Typst document.

```rust
fn main() {
    println!("Hello, world!");
}
```

More text here.
"#;

        let doc = LiterateDocument::parse_typst(content);
        assert!(doc.is_typst());
        assert!(doc.content().contains("Typst document"));
    }

    /// Two fenced blocks are found in order, each with its language tag and the
    /// line of its opening fence.
    #[test]
    fn test_code_block_extraction() {
        let content = r#"
# My Document

Here's some code:

```python
def hello():
    print("Hello!")
```

And more:

```rust
fn main() {}
```
"#;

        let doc = LiterateDocument::parse_markdown(content);
        let blocks = doc.extract_code_blocks();

        assert_eq!(blocks.len(), 2);

        // Line numbers are 1-indexed and the raw string starts with an empty
        // line, so the first fence sits on line 6.
        assert_eq!(blocks[0].language, Some("python".to_string()));
        assert!(blocks[0].content.contains("def hello()"));
        assert_eq!(blocks[0].line_number, 6);

        assert_eq!(blocks[1].language, Some("rust".to_string()));
        assert!(blocks[1].content.contains("fn main()"));
        assert_eq!(blocks[1].line_number, 13);
    }

    /// A fence without a tag yields `language == None` and a body with the
    /// trailing newline trimmed.
    #[test]
    fn test_code_block_no_language() {
        let content = r"
```
plain code here
```
";

        let doc = LiterateDocument::parse_markdown(content);
        let blocks = doc.extract_code_blocks();

        assert_eq!(blocks.len(), 1);
        assert_eq!(blocks[0].language, None);
        assert_eq!(blocks[0].content, "plain code here");
    }

    /// Markdown text is stored unchanged.
    #[test]
    fn test_markdown_passthrough() {
        let content = "# Hello\n\nThis is markdown.";
        let doc = LiterateDocument::parse_markdown(content);

        assert!(doc.is_markdown());
        assert_eq!(doc.content(), content);
    }

    /// Raw text is stored unchanged and never yields code blocks.
    #[test]
    fn test_raw_text() {
        let content = "Just plain text";
        let doc = LiterateDocument::raw(content);

        assert!(doc.is_raw());
        assert_eq!(doc.content(), content);

        // Raw text should have no code blocks
        let blocks = doc.extract_code_blocks();
        assert!(blocks.is_empty());
    }

    /// Headings, paragraphs and fenced code each map to their HTML element.
    #[test]
    fn test_to_html_basic() {
        let content = r"# Title

This is a paragraph.

```rust
fn main() {}
```
";

        let doc = LiterateDocument::parse_markdown(content);
        let html = doc.to_html();

        assert!(html.contains("<!DOCTYPE html>"));
        assert!(html.contains("<h1>Title</h1>"));
        assert!(html.contains("<p>This is a paragraph.</p>"));
        assert!(html.contains("<pre><code class=\"language-rust\">"));
        assert!(html.contains("fn main()"));
    }

    /// Markup inside document text is escaped rather than emitted as HTML.
    #[test]
    fn test_to_html_escaping() {
        let content = "This has <script>alert('xss')</script> in it.";
        let doc = LiterateDocument::parse_markdown(content);
        let html = doc.to_html();

        assert!(!html.contains("<script>"));
        assert!(html.contains("&lt;script&gt;"));
    }

    /// Raw text is rendered inside `<pre>` with its line breaks intact.
    #[test]
    fn test_raw_text_to_html() {
        let content = "Line 1\nLine 2";
        let doc = LiterateDocument::raw(content);
        let html = doc.to_html();

        assert!(html.contains("<pre>"));
        assert!(html.contains("Line 1\nLine 2"));
    }

    /// The number of leading `#` characters selects the heading level.
    #[test]
    fn test_multiple_headings() {
        let content = "# H1\n## H2\n### H3";
        let doc = LiterateDocument::parse_markdown(content);
        let html = doc.to_html();

        assert!(html.contains("<h1>H1</h1>"));
        assert!(html.contains("<h2>H2</h2>"));
        assert!(html.contains("<h3>H3</h3>"));
    }

    /// The builder sets all three fields of `CodeBlock`.
    #[test]
    fn test_code_block_struct() {
        let block = CodeBlock::new("let x = 1;", 10).with_language("rust");

        assert_eq!(block.language, Some("rust".to_string()));
        assert_eq!(block.content, "let x = 1;");
        assert_eq!(block.line_number, 10);
    }

    /// An empty language tag is normalised to `None`.
    #[test]
    fn test_empty_language_becomes_none() {
        let block = CodeBlock::new("code", 1).with_language("");
        assert_eq!(block.language, None);
    }

    /// Fenced blocks are extracted from Typst documents as well; the `#set`
    /// line outside the fence is not part of any block.
    #[test]
    fn test_typst_code_extraction() {
        let content = r"
= Typst Document

#set text(size: 12pt)

```python
import numpy as np
x = np.array([1, 2, 3])
```

More content here.
";

        let doc = LiterateDocument::parse_typst(content);
        let blocks = doc.extract_code_blocks();

        assert_eq!(blocks.len(), 1);
        assert_eq!(blocks[0].language, Some("python".to_string()));
        assert!(blocks[0].content.contains("import numpy"));
    }
}