ragit/index/file/
markdown.rs

1use super::{AtomicToken, FileReaderImpl, Image};
2use crate::error::Error;
3use crate::index::BuildConfig;
4use lazy_static::lazy_static;
5use ragit_fs::{FileError, exists, extension, join, parent, read_bytes};
6use ragit_pdl::ImageType;
7use regex::Regex;
8use std::collections::HashMap;
9use std::fs::File;
10use std::io::{BufRead, BufReader};
11
lazy_static! {
    // Code fence: optional whitespace (capture 1, the indentation), a run of three or more
    // backticks or three or more tildes (capture 2, the fence itself), then everything up to
    // the next backtick (capture 3, the info string).
    // The pattern is not anchored, so it can match anywhere inside a line.
    static ref FENCE_RE: Regex = Regex::new(r"(\s*)(\`{3,}|\~{3,})([^`]*)").unwrap();
    // Link reference definition: up to three whitespace characters, a bracketed label of
    // 1 to 999 characters without nested brackets (capture 1), an optionally spaced colon,
    // then the rest of the line (capture 2, the destination).
    // The pattern is not anchored, so it can match anywhere inside a line.
    static ref DEF_RE: Regex = Regex::new(r"\s{0,3}\[([^\[\]]{1,999})\]\s?\:\s?(.+)").unwrap();
    // Web URL: an alphabetic scheme, `://`, and at least one more `/` with something on both
    // sides of it. Used to tell remote images apart from local file paths.
    static ref WEB_URL_RE: Regex = Regex::new(r"[a-zA-Z]+\:\/\/.+\/.+").unwrap();
}
17
/// Line-oriented markdown reader that turns a markdown file into `AtomicToken`s:
/// plain text is kept as string tokens and image syntax is resolved into image tokens.
pub struct MarkdownReader {
    /// Path of the markdown file; relative image paths are resolved against its parent.
    path: String,
    /// Directory that image paths starting with `/` are resolved against.
    root_dir: String,
    /// Buffered handle on the markdown file, consumed one line at a time.
    lines: BufReader<File>,
    /// Tokens produced so far and not yet handed out by `pop_all_tokens`.
    tokens: Vec<AtomicToken>,
    /// Set once `read_line` reports the end of the file.
    is_exhausted: bool,
    /// Copied from `BuildConfig::strict_file_reader`. When set, unresolved image references
    /// and unreadable images are errors instead of falling back to plain text.
    strict_mode: bool,
    /// Whether the reader is currently inside a fenced code block.
    curr_parse_state: ParseState,
    /// Link reference definitions seen so far: normalized label -> destination.
    link_reference_definitions: HashMap<String, String>,
}
28
29impl FileReaderImpl for MarkdownReader {
30    fn new(
31        path: &str,
32        root_dir: &str,
33        config: &BuildConfig,
34    ) -> Result<Self, Error> {
35        match File::open(path) {
36            Ok(f) => Ok(MarkdownReader {
37                path: path.to_string(),
38                root_dir: root_dir.to_string(),
39                lines: BufReader::new(f),
40                tokens: vec![],
41                is_exhausted: false,
42                strict_mode: config.strict_file_reader,
43                curr_parse_state: ParseState::Paragraph,
44                link_reference_definitions: HashMap::new(),
45            }),
46            Err(e) => Err(FileError::from_std(e, path).into()),
47        }
48    }
49
50    fn load_tokens(&mut self) -> Result<(), Error> {
51        if self.is_exhausted {
52            return Ok(());
53        }
54
55        let mut buffer = vec![];
56
57        loop {
58            // NOTE: `line` includes a newline character
59            let mut line = String::new();
60
61            if self.lines.read_line(&mut line)? == 0 {
62                self.is_exhausted = true;
63                self.consume_buffer(buffer)?;
64                break;
65            }
66
67            match &self.curr_parse_state {
68                ParseState::Paragraph => match parse_code_fence(&line) {
69                    Some(fence) => {
70                        self.curr_parse_state = ParseState::CodeFence(fence);
71                    },
72                    None => {
73                        if let Some((label, destination)) = parse_link_reference_definition(&line) {
74                            self.link_reference_definitions.insert(label, destination);
75                            continue;
76                        }
77
78                        for token in parse_markdown_images(&line)? {
79                            buffer.push(token);
80                        }
81
82                        continue;
83                    },
84                },
85                ParseState::CodeFence(fence) => match parse_code_fence(&line) {
86                    Some(fence2) => {
87                        if match_fences(fence, &fence2) {
88                            self.curr_parse_state = ParseState::Paragraph;
89                        }
90                    },
91                    None => {},
92                },
93            }
94
95            buffer.push(StringOrImage::String(line));
96
97            if buffer.len() > 16 && !has_unknown_link_reference(&self.link_reference_definitions, &buffer) {
98                self.consume_buffer(buffer)?;
99                break;
100            }
101        }
102
103        Ok(())
104    }
105
106    fn pop_all_tokens(&mut self) -> Result<Vec<AtomicToken>, Error> {
107        let mut result = vec![];
108        std::mem::swap(&mut self.tokens, &mut result);
109        Ok(result)
110    }
111
112    fn has_more_to_read(&self) -> bool {
113        !self.is_exhausted
114    }
115
116    fn key(&self) -> String {
117        String::from("markdown_reader_v0")
118    }
119}
120
121impl MarkdownReader {
122    fn consume_buffer(&mut self, buffer: Vec<StringOrImage>) -> Result<(), Error> {
123        for token in buffer.into_iter() {
124            match token {
125                StringOrImage::String(s) => {
126                    self.tokens.push(AtomicToken::String {
127                        char_len: s.chars().count(),
128                        data: s,
129                    });
130                },
131                _ => {
132                    let (desc, mut url) = match token {
133                        StringOrImage::ImageUrl { desc, url } => (desc, url),
134                        StringOrImage::ImageRef { desc, r#ref } => match self.link_reference_definitions.get(&r#ref) {
135                            Some(url) => (desc, url.to_string()),
136                            _ => {
137                                if self.strict_mode {
138                                    return Err(Error::FileReaderError(format!("Cannot find image link reference: {ref:?}")));
139                                }
140
141                                let fallback = format!("![{desc}][{ref}]");
142                                self.tokens.push(AtomicToken::String {
143                                    char_len: fallback.chars().count(),
144                                    data: fallback,
145                                });
146                                continue;
147                            },
148                        },
149                        _ => unreachable!(),
150                    };
151
152                    if WEB_URL_RE.is_match(&url) {
153                        self.tokens.push(AtomicToken::WebImage { subst: format!("![{desc}]({url})"), url: url.to_string() });
154                        continue;
155                    }
156
157                    else if !exists(&url) {
158                        // Absolute path: root dir of the repository, not the root of the file system
159                        if url.starts_with("/") {
160                            url = join(&self.root_dir, &format!(".{}", &url))?;
161                        }
162
163                        // Relative path: relative to the markdown file
164                        else {
165                            url = join(&parent(&self.path)?, &url)?;
166                        }
167                    }
168
169                    let image = match load_image_token(&url) {
170                        Ok(image) => image,
171                        Err(e) => if self.strict_mode {
172                            return Err(e.into());
173                        } else {
174                            let fallback = format!("![{desc}]({url})");
175                            self.tokens.push(AtomicToken::String {
176                                data: fallback.clone(),
177                                char_len: fallback.chars().count(),
178                            });
179                            continue;
180                        },
181                    };
182                    self.tokens.push(image);
183                },
184            }
185        }
186
187        Ok(())
188    }
189}
190
/// Block-level state of the reader while it walks through the file line by line.
enum ParseState {
    /// Outside of any fenced code block; lines are scanned for images and definitions.
    Paragraph,
    /// Inside a fenced code block that was opened by the contained fence.
    CodeFence(CodeFence),
}
195
/// A fence line (opening or closing) as captured by `FENCE_RE`.
struct CodeFence {
    fence_char: u8,  // ` or ~
    /// Number of fence characters in the run (byte length of the captured fence).
    fence_len: usize,
    /// Trimmed text following the fence, or `None` when there is none.
    info_string: Option<String>,
    /// Byte length of the whitespace captured in front of the fence.
    indent: usize,
}
202
/// Intermediate item produced while scanning a line, before images are resolved and loaded.
#[derive(Clone, Debug)]
enum StringOrImage {
    /// Plain text, kept verbatim.
    String(String),
    ImageUrl { desc: String, url: String },    // ![desc](url)
    // `r#ref` holds the normalized link label (see `normalize_link_label`).
    ImageRef { desc: String, r#ref: String },  // ![ref] or ![desc][ref]
}
209
210// https://github.github.com/gfm/#fenced-code-blocks
211fn parse_code_fence(line: &str) -> Option<CodeFence> {
212    FENCE_RE.captures(line).map(
213        |cap| {
214            let indent = cap[1].len();
215            let fence = cap[2].to_string();
216            let info_string = cap[3].trim().to_string();
217
218            CodeFence {
219                fence_char: fence.as_bytes()[0],
220                fence_len: fence.len(),
221                info_string: if info_string.is_empty() { None } else { Some(info_string) },
222                indent,
223            }
224        }
225    )
226}
227
228fn match_fences(start: &CodeFence, end: &CodeFence) -> bool {
229    start.fence_char == end.fence_char &&
230    start.fence_len <= end.fence_len &&
231    end.indent < 4 &&
232    end.info_string.is_none()
233}
234
235// https://github.github.com/gfm/#link-reference-definition
236// TODO: it cannot handle multi-line link reference definitions
237fn parse_link_reference_definition(line: &str) -> Option<(String, String)> {
238    let result = DEF_RE.captures(line).map(
239        |cap| (
240            normalize_link_label(&cap[1]),
241            cap[2].trim().to_string(),
242        )
243    );
244
245    if let Some((label, _)) = &result {
246        if label.is_empty() { return None; }
247    }
248
249    result
250}
251
/// Normalizes a link label so that equivalent labels compare equal.
///
/// The label is trimmed and lowercased, every `\n`, `\t` and `\r` is turned into a space,
/// and each run of consecutive spaces is collapsed into a single space.
fn normalize_link_label(label: &str) -> String {
    let lowered = label.trim().to_lowercase();
    let mut normalized = String::with_capacity(lowered.len());
    let mut prev_was_space = false;

    for c in lowered.chars() {
        let c = if matches!(c, '\n' | '\t' | '\r') { ' ' } else { c };

        if c == ' ' {
            // Only the first space of a run is kept.
            if prev_was_space {
                continue;
            }

            prev_was_space = true;
        } else {
            prev_was_space = false;
        }

        normalized.push(c);
    }

    normalized
}
265
266fn has_unknown_link_reference(
267    link_reference_definitions: &HashMap<String, String>,
268    buffer: &[StringOrImage],
269) -> bool {
270    for token in buffer.iter() {
271        if let StringOrImage::ImageRef { r#ref, .. } = token {
272            if !link_reference_definitions.contains_key(r#ref) {
273                return true;
274            }
275        }
276    }
277
278    false
279}
280
281fn parse_markdown_images(line: &str) -> Result<Vec<StringOrImage>, Error> {
282    let chars = line.chars().collect::<Vec<_>>();
283    let mut index = 0;
284    let mut last_index = 0;
285    let mut result = vec![];
286
287    while index < chars.len() {
288        if is_code_span_start(&chars, index) {
289            index = march_until_code_span_end(&chars, index);
290        }
291
292        else {
293            match try_parse_image(&chars, index) {
294                Some(image) => {
295                    if last_index < index {
296                        result.push(StringOrImage::String(chars[last_index..index].iter().collect()));
297                    }
298
299                    index = march_until_image_end(&chars, index);
300                    last_index = index;
301                    result.push(image);
302                },
303                None => {
304                    index += 1;
305                    index = march_until_important_char(&chars, index);
306                },
307            }
308        }
309    }
310
311    if last_index < index {
312        result.push(StringOrImage::String(chars[last_index..index].iter().collect()));
313    }
314
315    Ok(result)
316}
317
/// Tells whether a code span may start at `index`: the character there is a backtick, it is
/// not the last character, and at least one non-backtick character follows.
fn is_code_span_start(chars: &[char], index: usize) -> bool {
    if chars.get(index) != Some(&'`') {
        return false;
    }

    if chars.len() <= index + 1 {
        return false;
    }

    chars[index..].iter().any(|c| *c != '`')
}
321
// It assumes that `is_code_span_start(chars, index)` is true.
// It returns the index of the first character after the code span. -> one that comes after '`'
// If the code span does not end (probably a markdown syntax error), it returns the index of the last character.
//
// A code span opened by a run of N backticks is closed by the next run of exactly N backticks
// (https://github.github.com/gfm/#code-spans). Longer or shorter runs inside the span are
// part of its content. A closing run that ends exactly at the end of `chars` is recognized.
//
// Panics if `chars[index]` is not a backtick or `index` is out of bounds.
fn march_until_code_span_end(chars: &[char], index: usize) -> usize {
    let backtick_run = |from: usize| chars[from..].iter().take_while(|c| **c == '`').count();
    let opening_len = backtick_run(index);

    assert!(opening_len != 0);

    let mut cursor = index + opening_len;

    while cursor < chars.len() {
        if chars[cursor] != '`' {
            cursor += 1;
            continue;
        }

        // Consume the whole run at once so that a part of a longer run is never
        // mistaken for the closing run.
        let run_len = backtick_run(cursor);
        cursor += run_len;

        if run_len == opening_len {
            return cursor;
        }
    }

    chars.len() - 1
}
347
348fn try_parse_image(chars: &[char], index: usize) -> Option<StringOrImage> {
349    match chars.get(index) {
350        Some('!') => match chars.get(index + 1) {
351            Some('[') => {},
352            _ => {
353                return None;
354            },
355        },
356        _ => {
357            return None;
358        },
359    }
360
361    let (bracket_content, index) = match get_matching_bracket_index(chars, index + 1) {
362        Some(new_index) => (chars[index + 2..new_index].iter().collect::<String>(), new_index),
363        None => {
364            return None;
365        },
366    };
367
368    match chars.get(index + 1) {
369        Some('[') => match get_matching_bracket_index(chars, index + 1) {
370            Some(new_index) => {
371                let r#ref = normalize_link_label(&chars[index + 2..new_index].iter().collect::<String>());
372
373                if r#ref.is_empty() {
374                    return None;
375                }
376
377                return Some(StringOrImage::ImageRef { desc: bracket_content, r#ref });
378            },
379            None => {},
380        },
381        Some('(') => match get_matching_bracket_index(chars, index + 1) {
382            Some(new_index) => {
383                return Some(StringOrImage::ImageUrl {
384                    desc: bracket_content,
385                    url: chars[index + 2..new_index].iter().collect::<String>(),
386                });
387            },
388            None => {},
389        },
390        _ => {},
391    }
392
393    let r#ref = normalize_link_label(&bracket_content);
394
395    if r#ref.is_empty() {
396        return None;
397    }
398
399    Some(StringOrImage::ImageRef { desc: String::new(), r#ref })
400}
401
402// It assumes that `try_parse_image(chars, index).is_some()`.
403fn march_until_image_end(chars: &[char], index: usize) -> usize {
404    let index = get_matching_bracket_index(chars, index + 1).unwrap();
405
406    match chars.get(index + 1) {
407        Some('[' | '(') => match get_matching_bracket_index(chars, index + 1) {
408            Some(index) => index + 1,
409            None => index + 1,
410        },
411        _ => index + 1,
412    }
413}
414
/// Starting at `index`, returns the position of the first '`' or '!'.
/// If there is none, it returns the first position past the end of `chars`
/// (which is `index` itself when `index` is already out of bounds).
fn march_until_important_char(chars: &[char], index: usize) -> usize {
    let mut cursor = index;

    while let Some(c) = chars.get(cursor) {
        if matches!(c, '`' | '!') {
            break;
        }

        cursor += 1;
    }

    cursor
}
430
/// Returns the index of the bracket that closes the opening bracket at `index`.
///
/// `(`, `[` and `{` are recognized and may be nested. Returns `None` when `chars[index]` is
/// not an opening bracket, when a closing bracket does not match the innermost open one, or
/// when the input ends before every bracket is closed.
///
/// Nesting is tracked with an explicit stack instead of recursion, so deeply nested input
/// cannot overflow the call stack.
fn get_matching_bracket_index(chars: &[char], mut index: usize) -> Option<usize> {
    fn closer_of(c: char) -> Option<char> {
        match c {
            '[' => Some(']'),
            '(' => Some(')'),
            '{' => Some('}'),
            _ => None,
        }
    }

    // Closing brackets that are still expected, innermost last.
    let mut expected = vec![closer_of(*chars.get(index)?)?];
    index += 1;

    while let Some(&c) = chars.get(index) {
        if let Some(closer) = closer_of(c) {
            expected.push(closer);
        }

        else if matches!(c, ')' | ']' | '}') {
            if expected.pop() != Some(c) {
                return None;
            }

            if expected.is_empty() {
                return Some(index);
            }
        }

        index += 1;
    }

    None
}
467
468fn load_image_token(url: &str) -> Result<AtomicToken, Error> {
469    let bytes = read_bytes(url)?;
470    let image_type = ImageType::from_extension(&extension(&url).unwrap_or(Some(String::from("png"))).unwrap_or(String::from("png")))?;
471    Ok(AtomicToken::Image(Image::new(bytes, image_type)?))
472}
473
#[cfg(test)]
mod tests {
    use super::super::{AtomicToken, FileReaderImpl};
    use super::MarkdownReader;
    use crate::index::BuildConfig;
    use ragit_fs::{WriteMode, remove_file, write_string};

    /// A markdown file without any valid image must come back as string tokens only, and
    /// concatenating them must reproduce the file exactly, even in strict mode.
    #[test]
    fn markdown_test() {
        let mut config_strict = BuildConfig::default();
        config_strict.strict_file_reader = true;

        let md1 = "
# Title

This is a markdown file that has no image.

![This is a broken image
";
        write_string("__tmp_test.md", md1, WriteMode::AlwaysCreate).unwrap();
        let mut md_reader = MarkdownReader::new("__tmp_test.md", ".", &config_strict).unwrap();

        while md_reader.has_more_to_read() {
            md_reader.load_tokens().unwrap();
        }

        // Every token must be a string token; anything else fails the test.
        let mut restored = String::new();

        for token in md_reader.pop_all_tokens().unwrap().iter() {
            match token {
                AtomicToken::String { data, .. } => restored.push_str(data),
                _ => panic!(),
            }
        }

        assert_eq!(restored, md1.to_string());
        remove_file("__tmp_test.md").unwrap();
    }
}