1use super::{AtomicToken, FileReaderImpl, Image};
2use crate::error::Error;
3use crate::index::BuildConfig;
4use lazy_static::lazy_static;
5use ragit_fs::{FileError, exists, extension, join, parent, read_bytes};
6use ragit_pdl::ImageType;
7use regex::Regex;
8use std::collections::HashMap;
9use std::fs::File;
10use std::io::{BufRead, BufReader};
11
lazy_static! {
    // Code-fence candidate. Captures (1) the whitespace right before the fence, (2) a run of
    // three or more backticks or tildes, and (3) the text that follows up to the next backtick.
    // The pattern is not anchored, so it can match in the middle of a line as well.
    static ref FENCE_RE: Regex = Regex::new(r"(\s*)(\`{3,}|\~{3,})([^`]*)").unwrap();
    // Link reference definition such as `[label]: destination`. Captures (1) the label
    // (1 to 999 characters, no square brackets) and (2) the remaining text on the line.
    // This pattern is not anchored either.
    static ref DEF_RE: Regex = Regex::new(r"\s{0,3}\[([^\[\]]{1,999})\]\s?\:\s?(.+)").unwrap();
    // Web URL check: ASCII letters as scheme, `://`, then text containing at least one `/`
    // with one or more characters on each side of it.
    static ref WEB_URL_RE: Regex = Regex::new(r"[a-zA-Z]+\:\/\/.+\/.+").unwrap();
}
17
/// Line-by-line reader that turns a markdown file into `AtomicToken`s.
pub struct MarkdownReader {
    /// Path of the markdown file, as passed to `new`. Relative image URLs are resolved
    /// against its parent directory.
    path: String,
    /// Directory used to resolve image URLs that start with `/`.
    root_dir: String,
    /// Buffered handle over the opened file; read one line at a time.
    lines: BufReader<File>,
    /// Tokens produced so far that have not yet been taken by `pop_all_tokens`.
    tokens: Vec<AtomicToken>,
    /// Set to `true` once the end of the file has been reached.
    is_exhausted: bool,
    /// When `true`, an unresolved image reference or an image that cannot be loaded is an
    /// error instead of falling back to a string token.
    strict_mode: bool,
    /// Whether the reader is currently in ordinary text or inside a fenced code block.
    curr_parse_state: ParseState,
    /// Link reference definitions seen so far, keyed by normalized label.
    link_reference_definitions: HashMap<String, String>,
}
28
29impl FileReaderImpl for MarkdownReader {
30 fn new(
31 path: &str,
32 root_dir: &str,
33 config: &BuildConfig,
34 ) -> Result<Self, Error> {
35 match File::open(path) {
36 Ok(f) => Ok(MarkdownReader {
37 path: path.to_string(),
38 root_dir: root_dir.to_string(),
39 lines: BufReader::new(f),
40 tokens: vec![],
41 is_exhausted: false,
42 strict_mode: config.strict_file_reader,
43 curr_parse_state: ParseState::Paragraph,
44 link_reference_definitions: HashMap::new(),
45 }),
46 Err(e) => Err(FileError::from_std(e, path).into()),
47 }
48 }
49
50 fn load_tokens(&mut self) -> Result<(), Error> {
51 if self.is_exhausted {
52 return Ok(());
53 }
54
55 let mut buffer = vec![];
56
57 loop {
58 let mut line = String::new();
60
61 if self.lines.read_line(&mut line)? == 0 {
62 self.is_exhausted = true;
63 self.consume_buffer(buffer)?;
64 break;
65 }
66
67 match &self.curr_parse_state {
68 ParseState::Paragraph => match parse_code_fence(&line) {
69 Some(fence) => {
70 self.curr_parse_state = ParseState::CodeFence(fence);
71 },
72 None => {
73 if let Some((label, destination)) = parse_link_reference_definition(&line) {
74 self.link_reference_definitions.insert(label, destination);
75 continue;
76 }
77
78 for token in parse_markdown_images(&line)? {
79 buffer.push(token);
80 }
81
82 continue;
83 },
84 },
85 ParseState::CodeFence(fence) => match parse_code_fence(&line) {
86 Some(fence2) => {
87 if match_fences(fence, &fence2) {
88 self.curr_parse_state = ParseState::Paragraph;
89 }
90 },
91 None => {},
92 },
93 }
94
95 buffer.push(StringOrImage::String(line));
96
97 if buffer.len() > 16 && !has_unknown_link_reference(&self.link_reference_definitions, &buffer) {
98 self.consume_buffer(buffer)?;
99 break;
100 }
101 }
102
103 Ok(())
104 }
105
106 fn pop_all_tokens(&mut self) -> Result<Vec<AtomicToken>, Error> {
107 let mut result = vec![];
108 std::mem::swap(&mut self.tokens, &mut result);
109 Ok(result)
110 }
111
112 fn has_more_to_read(&self) -> bool {
113 !self.is_exhausted
114 }
115
116 fn key(&self) -> String {
117 String::from("markdown_reader_v0")
118 }
119}
120
121impl MarkdownReader {
122 fn consume_buffer(&mut self, buffer: Vec<StringOrImage>) -> Result<(), Error> {
123 for token in buffer.into_iter() {
124 match token {
125 StringOrImage::String(s) => {
126 self.tokens.push(AtomicToken::String {
127 char_len: s.chars().count(),
128 data: s,
129 });
130 },
131 _ => {
132 let (desc, mut url) = match token {
133 StringOrImage::ImageUrl { desc, url } => (desc, url),
134 StringOrImage::ImageRef { desc, r#ref } => match self.link_reference_definitions.get(&r#ref) {
135 Some(url) => (desc, url.to_string()),
136 _ => {
137 if self.strict_mode {
138 return Err(Error::FileReaderError(format!("Cannot find image link reference: {ref:?}")));
139 }
140
141 let fallback = format!("![{desc}][{ref}]");
142 self.tokens.push(AtomicToken::String {
143 char_len: fallback.chars().count(),
144 data: fallback,
145 });
146 continue;
147 },
148 },
149 _ => unreachable!(),
150 };
151
152 if WEB_URL_RE.is_match(&url) {
153 self.tokens.push(AtomicToken::WebImage { subst: format!(""), url: url.to_string() });
154 continue;
155 }
156
157 else if !exists(&url) {
158 if url.starts_with("/") {
160 url = join(&self.root_dir, &format!(".{}", &url))?;
161 }
162
163 else {
165 url = join(&parent(&self.path)?, &url)?;
166 }
167 }
168
169 let image = match load_image_token(&url) {
170 Ok(image) => image,
171 Err(e) => if self.strict_mode {
172 return Err(e.into());
173 } else {
174 let fallback = format!("");
175 self.tokens.push(AtomicToken::String {
176 data: fallback.clone(),
177 char_len: fallback.chars().count(),
178 });
179 continue;
180 },
181 };
182 self.tokens.push(image);
183 },
184 }
185 }
186
187 Ok(())
188 }
189}
190
/// Line-level parsing state of the markdown reader.
enum ParseState {
    /// Ordinary text: lines are scanned for code fences, link reference definitions and images.
    Paragraph,
    /// Inside a fenced code block that was opened by the contained fence.
    CodeFence(CodeFence),
}
195
/// A code-fence line as recognized by `parse_code_fence`.
struct CodeFence {
    // First byte of the fence run: a backtick or a tilde.
    fence_char: u8,
    // Length of the fence run in bytes.
    fence_len: usize,
    // Trimmed text that follows the fence run; `None` when it is empty.
    info_string: Option<String>,
    // Byte length of the whitespace captured right before the fence run.
    indent: usize,
}
202
/// A piece of a parsed line: either plain text or an image that still has to be resolved.
#[derive(Clone, Debug)]
enum StringOrImage {
    /// Plain text, kept verbatim.
    String(String),
    /// Inline image: bracket content as `desc`, parenthesized content as `url`.
    ImageUrl { desc: String, url: String },
    /// Reference-style image: `ref` is a normalized link label to be looked up in the
    /// link reference definitions.
    ImageRef { desc: String, r#ref: String },
}
209
210fn parse_code_fence(line: &str) -> Option<CodeFence> {
212 FENCE_RE.captures(line).map(
213 |cap| {
214 let indent = cap[1].len();
215 let fence = cap[2].to_string();
216 let info_string = cap[3].trim().to_string();
217
218 CodeFence {
219 fence_char: fence.as_bytes()[0],
220 fence_len: fence.len(),
221 info_string: if info_string.is_empty() { None } else { Some(info_string) },
222 indent,
223 }
224 }
225 )
226}
227
228fn match_fences(start: &CodeFence, end: &CodeFence) -> bool {
229 start.fence_char == end.fence_char &&
230 start.fence_len <= end.fence_len &&
231 end.indent < 4 &&
232 end.info_string.is_none()
233}
234
235fn parse_link_reference_definition(line: &str) -> Option<(String, String)> {
238 let result = DEF_RE.captures(line).map(
239 |cap| (
240 normalize_link_label(&cap[1]),
241 cap[2].trim().to_string(),
242 )
243 );
244
245 if let Some((label, _)) = &result {
246 if label.is_empty() { return None; }
247 }
248
249 result
250}
251
/// Normalizes a link label so that labels can be compared as map keys.
///
/// Leading and trailing whitespace is removed, every interior run of whitespace
/// (spaces, tabs, line endings) is collapsed into a single space, and the result is
/// lowercased. A label that consists only of whitespace yields an empty string.
fn normalize_link_label(label: &str) -> String {
    // `split_whitespace` drops leading and trailing whitespace and treats every run of
    // whitespace as one separator, so this always terminates.
    label
        .split_whitespace()
        .collect::<Vec<_>>()
        .join(" ")
        .to_lowercase()
}
265
266fn has_unknown_link_reference(
267 link_reference_definitions: &HashMap<String, String>,
268 buffer: &[StringOrImage],
269) -> bool {
270 for token in buffer.iter() {
271 if let StringOrImage::ImageRef { r#ref, .. } = token {
272 if !link_reference_definitions.contains_key(r#ref) {
273 return true;
274 }
275 }
276 }
277
278 false
279}
280
281fn parse_markdown_images(line: &str) -> Result<Vec<StringOrImage>, Error> {
282 let chars = line.chars().collect::<Vec<_>>();
283 let mut index = 0;
284 let mut last_index = 0;
285 let mut result = vec![];
286
287 while index < chars.len() {
288 if is_code_span_start(&chars, index) {
289 index = march_until_code_span_end(&chars, index);
290 }
291
292 else {
293 match try_parse_image(&chars, index) {
294 Some(image) => {
295 if last_index < index {
296 result.push(StringOrImage::String(chars[last_index..index].iter().collect()));
297 }
298
299 index = march_until_image_end(&chars, index);
300 last_index = index;
301 result.push(image);
302 },
303 None => {
304 index += 1;
305 index = march_until_important_char(&chars, index);
306 },
307 }
308 }
309 }
310
311 if last_index < index {
312 result.push(StringOrImage::String(chars[last_index..index].iter().collect()));
313 }
314
315 Ok(result)
316}
317
/// Returns `true` if `chars[index]` is a backtick and at least one character after it
/// is not a backtick. Out-of-range indices yield `false`.
fn is_code_span_start(chars: &[char], index: usize) -> bool {
    match chars.get(index..) {
        Some(['`', rest @ ..]) => rest.iter().any(|&c| c != '`'),
        _ => false,
    }
}
321
/// Given the index of the first backtick of a code span opener, returns the index right
/// after the closing backtick run.
///
/// The opener is the run of backticks starting at `index`. The closer is the first later
/// position at which the same number of consecutive backticks is found, including a run
/// that ends exactly at the end of `chars`. If there is no closer, the index of the last
/// character of `chars` is returned.
///
/// # Panics
///
/// Panics if `index` is out of range, if `chars[index]` is not a backtick, or if
/// everything from `index` on is backticks (see `is_code_span_start`).
fn march_until_code_span_end(chars: &[char], index: usize) -> usize {
    let span = &chars[index..];
    // Length of the opening run = offset of the first non-backtick character.
    let backtick_count = span.iter().position(|c| *c != '`').unwrap_or(0);

    assert!(backtick_count != 0);

    // Offset 0 is the opener itself, so the search starts at offset 1. `windows` also
    // yields the last full window, so a closer at the very end of the slice is found.
    span.windows(backtick_count)
        .enumerate()
        .skip(1)
        .find(|(_, run)| run.iter().all(|c| *c == '`'))
        .map_or(chars.len() - 1, |(offset, _)| index + offset + backtick_count)
}
347
348fn try_parse_image(chars: &[char], index: usize) -> Option<StringOrImage> {
349 match chars.get(index) {
350 Some('!') => match chars.get(index + 1) {
351 Some('[') => {},
352 _ => {
353 return None;
354 },
355 },
356 _ => {
357 return None;
358 },
359 }
360
361 let (bracket_content, index) = match get_matching_bracket_index(chars, index + 1) {
362 Some(new_index) => (chars[index + 2..new_index].iter().collect::<String>(), new_index),
363 None => {
364 return None;
365 },
366 };
367
368 match chars.get(index + 1) {
369 Some('[') => match get_matching_bracket_index(chars, index + 1) {
370 Some(new_index) => {
371 let r#ref = normalize_link_label(&chars[index + 2..new_index].iter().collect::<String>());
372
373 if r#ref.is_empty() {
374 return None;
375 }
376
377 return Some(StringOrImage::ImageRef { desc: bracket_content, r#ref });
378 },
379 None => {},
380 },
381 Some('(') => match get_matching_bracket_index(chars, index + 1) {
382 Some(new_index) => {
383 return Some(StringOrImage::ImageUrl {
384 desc: bracket_content,
385 url: chars[index + 2..new_index].iter().collect::<String>(),
386 });
387 },
388 None => {},
389 },
390 _ => {},
391 }
392
393 let r#ref = normalize_link_label(&bracket_content);
394
395 if r#ref.is_empty() {
396 return None;
397 }
398
399 Some(StringOrImage::ImageRef { desc: String::new(), r#ref })
400}
401
402fn march_until_image_end(chars: &[char], index: usize) -> usize {
404 let index = get_matching_bracket_index(chars, index + 1).unwrap();
405
406 match chars.get(index + 1) {
407 Some('[' | '(') => match get_matching_bracket_index(chars, index + 1) {
408 Some(index) => index + 1,
409 None => index + 1,
410 },
411 _ => index + 1,
412 }
413}
414
/// Returns the first position at or after `index` that holds a backtick or a `!`.
///
/// If there is none, the length of `chars` is returned. An `index` at or past the end
/// is returned unchanged.
fn march_until_important_char(chars: &[char], index: usize) -> usize {
    match chars.get(index..) {
        Some(tail) => {
            let offset = tail
                .iter()
                .position(|c| matches!(c, '`' | '!'))
                .unwrap_or(tail.len());
            index + offset
        },
        // `index` is past the end of `chars`.
        None => index,
    }
}
430
/// Given the index of an opening bracket (`[`, `(` or `{`), returns the index of the
/// bracket that closes it.
///
/// Nested brackets of all three kinds must be balanced. Returns `None` if `chars[index]`
/// is not an opening bracket, if a closing bracket of the wrong kind is met, or if the
/// input ends before the bracket is closed.
///
/// Nesting is tracked with an explicit stack instead of recursion, so deeply nested
/// input cannot overflow the call stack.
fn get_matching_bracket_index(chars: &[char], mut index: usize) -> Option<usize> {
    // Maps an opening bracket to its closing counterpart.
    fn closer_of(c: char) -> Option<char> {
        match c {
            '[' => Some(']'),
            '(' => Some(')'),
            '{' => Some('}'),
            _ => None,
        }
    }

    // Closing brackets that are still expected, innermost last.
    let mut expected = vec![closer_of(*chars.get(index)?)?];
    index += 1;

    loop {
        let c = *chars.get(index)?;

        if let Some(closer) = closer_of(c) {
            expected.push(closer);
        } else if matches!(c, ')' | ']' | '}') {
            // A closing bracket must match the innermost open bracket.
            if expected.pop() != Some(c) {
                return None;
            }

            if expected.is_empty() {
                return Some(index);
            }
        }

        index += 1;
    }
}
467
468fn load_image_token(url: &str) -> Result<AtomicToken, Error> {
469 let bytes = read_bytes(url)?;
470 let image_type = ImageType::from_extension(&extension(&url).unwrap_or(Some(String::from("png"))).unwrap_or(String::from("png")))?;
471 Ok(AtomicToken::Image(Image::new(bytes, image_type)?))
472}
473
#[cfg(test)]
mod tests {
    use super::super::{AtomicToken, FileReaderImpl};
    use super::MarkdownReader;
    use crate::index::BuildConfig;
    use ragit_fs::{WriteMode, remove_file, write_string};

    // Round-trip check: a markdown file without any valid image must come back as string
    // tokens whose concatenation equals the original text, even in strict mode.
    #[test]
    fn markdown_test() {
        let config_default = BuildConfig::default();
        let mut config_strict = config_default.clone();
        config_strict.strict_file_reader = true;
        // The last line starts an image but never closes the bracket.
        let md1 = "
# Title

This is a markdown file that has no image.

![This is a broken image
";
        // The fixture is written to a fixed file name in the current directory.
        write_string("__tmp_test.md", md1, WriteMode::AlwaysCreate).unwrap();
        let mut md_reader = MarkdownReader::new("__tmp_test.md", ".", &config_strict).unwrap();

        // Drain the reader completely before collecting tokens.
        while md_reader.has_more_to_read() {
            md_reader.load_tokens().unwrap();
        }

        let md1_tokens = md_reader.pop_all_tokens().unwrap();
        assert_eq!(
            md1_tokens.iter().map(
                |token| match token {
                    AtomicToken::String { data, .. } => data.to_string(),
                    // Any non-string token means the broken image was misparsed.
                    _ => panic!(),
                }
            ).collect::<Vec<_>>().concat(),
            md1.to_string(),
        );
        // NOTE: not reached if the assertion above fails, so the fixture file is left behind.
        remove_file("__tmp_test.md").unwrap();
    }
}
512}