entrenar/research/
literate.rs1use regex::Regex;
6use serde::{Deserialize, Serialize};
7use std::sync::LazyLock;
8
9static TYPST_CODE_BLOCK: LazyLock<Regex> = LazyLock::new(|| {
11 Regex::new(r"```(\w*)\n([\s\S]*?)```").expect("Invalid Typst code block regex")
12});
13
14static MARKDOWN_CODE_BLOCK: LazyLock<Regex> = LazyLock::new(|| {
16 Regex::new(r"```(\w*)\n([\s\S]*?)```").expect("Invalid Markdown code block regex")
17});
18
/// A fenced code block taken from a literate document.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct CodeBlock {
    /// Language tag of the block; `None` when no tag was given
    /// (`with_language` maps an empty tag to `None`).
    pub language: Option<String>,
    /// The code itself. Blocks produced by `extract_blocks_with_regex` have
    /// trailing whitespace trimmed.
    pub content: String,
    /// Line associated with the block. `extract_blocks_with_regex` sets this
    /// to the 1-based line on which the matched fence starts.
    pub line_number: usize,
}
29
30impl CodeBlock {
31 pub fn new(content: impl Into<String>, line_number: usize) -> Self {
33 Self { language: None, content: content.into(), line_number }
34 }
35
36 pub fn with_language(mut self, language: impl Into<String>) -> Self {
38 let lang = language.into();
39 self.language = if lang.is_empty() { None } else { Some(lang) };
40 self
41 }
42}
43
/// A literate document, tagged by the format of its source text.
///
/// Each variant stores the source text unchanged; see `content`.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub enum LiterateDocument {
    /// Typst source. Code blocks are located with `TYPST_CODE_BLOCK`.
    Typst(String),
    /// Markdown source. Code blocks are located with `MARKDOWN_CODE_BLOCK`.
    Markdown(String),
    /// Plain text. Yields no code blocks and is rendered to HTML as a single
    /// `<pre>` element.
    RawText(String),
}
54
55impl LiterateDocument {
56 pub fn parse_typst(content: impl Into<String>) -> Self {
58 Self::Typst(content.into())
59 }
60
61 pub fn parse_markdown(content: impl Into<String>) -> Self {
63 Self::Markdown(content.into())
64 }
65
66 pub fn raw(content: impl Into<String>) -> Self {
68 Self::RawText(content.into())
69 }
70
71 pub fn content(&self) -> &str {
73 match self {
74 Self::Typst(s) | Self::Markdown(s) | Self::RawText(s) => s,
75 }
76 }
77
78 pub fn extract_code_blocks(&self) -> Vec<CodeBlock> {
80 match self {
81 Self::Typst(content) => extract_blocks_with_regex(content, &TYPST_CODE_BLOCK),
82 Self::Markdown(content) => extract_blocks_with_regex(content, &MARKDOWN_CODE_BLOCK),
83 Self::RawText(_) => Vec::new(),
84 }
85 }
86
87 pub fn to_html(&self) -> String {
89 match self {
90 Self::Typst(content) | Self::Markdown(content) => {
91 let mut html = String::new();
92 html.push_str("<!DOCTYPE html>\n<html>\n<head>\n");
93 html.push_str("<meta charset=\"utf-8\">\n");
94 html.push_str("<style>\n");
95 html.push_str("body { font-family: system-ui, sans-serif; max-width: 800px; margin: 0 auto; padding: 2rem; }\n");
96 html.push_str("pre { background: #f5f5f5; padding: 1rem; overflow-x: auto; }\n");
97 html.push_str("code { font-family: monospace; }\n");
98 html.push_str("</style>\n</head>\n<body>\n");
99
100 let mut in_code_block = false;
102 let mut code_lang = String::new();
103 let mut code_content = String::new();
104
105 for line in content.lines() {
106 if line.starts_with("```") {
107 if in_code_block {
108 html.push_str("<pre><code");
110 if !code_lang.is_empty() {
111 html.push_str(&format!(" class=\"language-{code_lang}\""));
112 }
113 html.push('>');
114 html.push_str(&escape_html(&code_content));
115 html.push_str("</code></pre>\n");
116 code_content.clear();
117 code_lang.clear();
118 in_code_block = false;
119 } else {
120 code_lang = line.trim_start_matches('`').to_string();
122 in_code_block = true;
123 }
124 } else if in_code_block {
125 if !code_content.is_empty() {
126 code_content.push('\n');
127 }
128 code_content.push_str(line);
129 } else if line.starts_with('#') {
130 let level = line.chars().take_while(|&c| c == '#').count().min(6);
132 let text = line.trim_start_matches('#').trim();
133 html.push_str(&format!("<h{level}>{}</h{level}>\n", escape_html(text)));
134 } else if line.is_empty() {
135 } else {
137 html.push_str(&format!("<p>{}</p>\n", escape_html(line)));
139 }
140 }
141
142 html.push_str("</body>\n</html>");
143 html
144 }
145 Self::RawText(content) => {
146 format!(
147 "<!DOCTYPE html>\n<html>\n<body>\n<pre>{}</pre>\n</body>\n</html>",
148 escape_html(content)
149 )
150 }
151 }
152 }
153
154 pub fn is_typst(&self) -> bool {
156 matches!(self, Self::Typst(_))
157 }
158
159 pub fn is_markdown(&self) -> bool {
161 matches!(self, Self::Markdown(_))
162 }
163
164 pub fn is_raw(&self) -> bool {
166 matches!(self, Self::RawText(_))
167 }
168}
169
170fn extract_blocks_with_regex(content: &str, pattern: &Regex) -> Vec<CodeBlock> {
172 let mut blocks = Vec::new();
173
174 for cap in pattern.captures_iter(content) {
175 let full_match = cap.get(0).expect("capture group 0 always exists in a regex match");
176 let lang = cap.get(1).map(|m| m.as_str().to_string());
177 let code = cap.get(2).map(|m| m.as_str().to_string()).unwrap_or_default();
178
179 let line_number = content[..full_match.start()].chars().filter(|&c| c == '\n').count() + 1;
181
182 let mut block = CodeBlock::new(code.trim_end(), line_number);
183 if let Some(l) = lang {
184 block = block.with_language(l);
185 }
186 blocks.push(block);
187 }
188
189 blocks
190}
191
/// Escapes text for safe inclusion in HTML element content and in
/// double- or single-quoted attribute values.
///
/// Replaces `&`, `<`, `>`, `"` and `'` with `&amp;`, `&lt;`, `&gt;`,
/// `&quot;` and `&#39;` respectively; every other character is copied
/// unchanged. The input is walked once, so an `&` produced by one
/// replacement can never be escaped a second time.
fn escape_html(s: &str) -> String {
    let mut escaped = String::with_capacity(s.len());
    for ch in s.chars() {
        match ch {
            '&' => escaped.push_str("&amp;"),
            '<' => escaped.push_str("&lt;"),
            '>' => escaped.push_str("&gt;"),
            '"' => escaped.push_str("&quot;"),
            '\'' => escaped.push_str("&#39;"),
            other => escaped.push(other),
        }
    }
    escaped
}
200
#[cfg(test)]
mod tests {
    use super::*;

    /// A Typst document keeps its variant and its text.
    #[test]
    fn test_typst_parsing() {
        let content = r#"
= Introduction

This is a Typst document.

```rust
fn main() {
    println!("Hello, world!");
}
```

More text here.
"#;

        let doc = LiterateDocument::parse_typst(content);
        assert!(doc.is_typst());
        assert!(doc.content().contains("Typst document"));
    }

    /// Two fenced blocks are found with their tags and 1-based start lines.
    #[test]
    fn test_code_block_extraction() {
        let content = r#"
# My Document

Here's some code:

```python
def hello():
    print("Hello!")
```

And more:

```rust
fn main() {}
```
"#;

        let doc = LiterateDocument::parse_markdown(content);
        let blocks = doc.extract_code_blocks();

        assert_eq!(blocks.len(), 2);

        assert_eq!(blocks[0].language, Some("python".to_string()));
        assert!(blocks[0].content.contains("def hello()"));
        assert_eq!(blocks[0].line_number, 6);

        assert_eq!(blocks[1].language, Some("rust".to_string()));
        assert!(blocks[1].content.contains("fn main()"));
        assert_eq!(blocks[1].line_number, 13);
    }

    /// A fence without a tag yields `language == None`.
    #[test]
    fn test_code_block_no_language() {
        let content = r"
```
plain code here
```
";

        let doc = LiterateDocument::parse_markdown(content);
        let blocks = doc.extract_code_blocks();

        assert_eq!(blocks.len(), 1);
        assert_eq!(blocks[0].language, None);
        assert_eq!(blocks[0].content, "plain code here");
    }

    /// Markdown text is stored unchanged.
    #[test]
    fn test_markdown_passthrough() {
        let content = "# Hello\n\nThis is markdown.";
        let doc = LiterateDocument::parse_markdown(content);

        assert!(doc.is_markdown());
        assert_eq!(doc.content(), content);
    }

    /// Raw text is stored unchanged and never yields code blocks.
    #[test]
    fn test_raw_text() {
        let content = "Just plain text";
        let doc = LiterateDocument::raw(content);

        assert!(doc.is_raw());
        assert_eq!(doc.content(), content);

        let blocks = doc.extract_code_blocks();
        assert!(blocks.is_empty());
    }

    /// Headings, paragraphs and tagged code blocks all reach the HTML output.
    #[test]
    fn test_to_html_basic() {
        let content = r"# Title

This is a paragraph.

```rust
fn main() {}
```
";

        let doc = LiterateDocument::parse_markdown(content);
        let html = doc.to_html();

        assert!(html.contains("<!DOCTYPE html>"));
        assert!(html.contains("<h1>Title</h1>"));
        assert!(html.contains("<p>This is a paragraph.</p>"));
        assert!(html.contains("<pre><code class=\"language-rust\">"));
        assert!(html.contains("fn main()"));
    }

    /// Markup in the document text must come out escaped, never verbatim.
    #[test]
    fn test_to_html_escaping() {
        let content = "This has <script>alert('xss')</script> in it.";
        let doc = LiterateDocument::parse_markdown(content);
        let html = doc.to_html();

        assert!(!html.contains("<script>"));
        assert!(html.contains("&lt;script&gt;"));
    }

    /// Raw text is wrapped in `<pre>` with its line breaks preserved.
    #[test]
    fn test_raw_text_to_html() {
        let content = "Line 1\nLine 2";
        let doc = LiterateDocument::raw(content);
        let html = doc.to_html();

        assert!(html.contains("<pre>"));
        assert!(html.contains("Line 1\nLine 2"));
    }

    /// The heading level follows the number of leading `#`.
    #[test]
    fn test_multiple_headings() {
        let content = "# H1\n## H2\n### H3";
        let doc = LiterateDocument::parse_markdown(content);
        let html = doc.to_html();

        assert!(html.contains("<h1>H1</h1>"));
        assert!(html.contains("<h2>H2</h2>"));
        assert!(html.contains("<h3>H3</h3>"));
    }

    /// `new` + `with_language` populate all three fields.
    #[test]
    fn test_code_block_struct() {
        let block = CodeBlock::new("let x = 1;", 10).with_language("rust");

        assert_eq!(block.language, Some("rust".to_string()));
        assert_eq!(block.content, "let x = 1;");
        assert_eq!(block.line_number, 10);
    }

    /// An empty tag is normalised to `None`.
    #[test]
    fn test_empty_language_becomes_none() {
        let block = CodeBlock::new("code", 1).with_language("");
        assert_eq!(block.language, None);
    }

    /// Fenced blocks are also extracted from Typst documents.
    #[test]
    fn test_typst_code_extraction() {
        let content = r"
= Typst Document

#set text(size: 12pt)

```python
import numpy as np
x = np.array([1, 2, 3])
```

More content here.
";

        let doc = LiterateDocument::parse_typst(content);
        let blocks = doc.extract_code_blocks();

        assert_eq!(blocks.len(), 1);
        assert_eq!(blocks[0].language, Some("python".to_string()));
        assert!(blocks[0].content.contains("import numpy"));
    }
}