Skip to main content

hmd_parse/
lib.rs

1use hmd_core::{
2    Diagnostic, DuplicateIdRecord, FenceInfo, HmdBlock, HmdDocument, HmdNode, IdRecord,
3    MarkdownNode, RefRecord, ReferenceIndex, SourceSpan, TomlValueObject,
4};
5use serde_json::{Map, Number, Value};
6use std::collections::BTreeMap;
7use toml_edit::DocumentMut;
8
/// Profile identifiers, written as `name@version`, that frontmatter may name in `profile`
/// or `uses` without triggering an `HMD007` "unknown profile" diagnostic.
const BUILTIN_PROFILES: &[&str] = &["general@0.1", "decision@0.1", "progress@0.1", "todo@0.1"];
10
11pub fn parse_document(source: &str) -> HmdDocument {
12    let lines = collect_lines(source);
13    let mut diagnostics = Vec::new();
14    let frontmatter = parse_frontmatter(source, &lines, &mut diagnostics);
15
16    let mut parser = BlockParser {
17        source,
18        lines: &lines,
19        diagnostics: &mut diagnostics,
20    };
21    let mut index = frontmatter.body_start_line_index;
22    let parsed = parser.parse_nodes(&mut index, None);
23
24    let references = build_reference_index(&parsed.children, &mut diagnostics);
25
26    let mut document = HmdDocument::new(frontmatter.hmd_version, frontmatter.profile);
27    document.uses = frontmatter.uses;
28    document.meta = frontmatter.meta;
29    document.children = parsed.children;
30    document.references = references;
31    document.diagnostics = diagnostics;
32    document
33}
34
35#[derive(Debug)]
36struct Frontmatter {
37    hmd_version: String,
38    profile: String,
39    uses: Vec<String>,
40    meta: TomlValueObject,
41    body_start_line_index: usize,
42}
43
44fn parse_frontmatter(
45    source: &str,
46    lines: &[Line<'_>],
47    diagnostics: &mut Vec<Diagnostic>,
48) -> Frontmatter {
49    let mut frontmatter = Frontmatter {
50        hmd_version: String::new(),
51        profile: String::new(),
52        uses: Vec::new(),
53        meta: Map::new(),
54        body_start_line_index: 0,
55    };
56
57    let Some(first_line) = lines.first() else {
58        diagnostics.push(Diagnostic::error("HMD001", "missing document frontmatter"));
59        return frontmatter;
60    };
61
62    if !is_frontmatter_delimiter(first_line, true) {
63        diagnostics.push(
64            Diagnostic::error("HMD001", "missing document frontmatter")
65                .with_source(line_span(first_line)),
66        );
67        return frontmatter;
68    }
69
70    let closing_index = lines
71        .iter()
72        .enumerate()
73        .skip(1)
74        .find_map(|(index, line)| is_frontmatter_delimiter(line, false).then_some(index));
75
76    let Some(closing_index) = closing_index else {
77        diagnostics.push(
78            Diagnostic::error("HMD002", "unterminated document frontmatter")
79                .with_source(line_span(first_line)),
80        );
81        frontmatter.body_start_line_index = lines.len();
82        return frontmatter;
83    };
84
85    let toml_start = lines
86        .get(1)
87        .map(|line| line.start)
88        .unwrap_or(first_line.end);
89    let toml_end = lines[closing_index].start;
90    let toml_source = &source[toml_start..toml_end];
91
92    match parse_toml_object(toml_source) {
93        Ok(meta) => {
94            frontmatter.hmd_version = string_field(&meta, "hmd").unwrap_or_default();
95            frontmatter.profile = string_field(&meta, "profile").unwrap_or_default();
96            frontmatter.uses = string_array_field(&meta, "uses");
97            frontmatter.meta = meta;
98        }
99        Err(message) => diagnostics.push(
100            Diagnostic::error(
101                "HMD002",
102                format!("invalid document frontmatter TOML: {message}"),
103            )
104            .with_source(line_span(first_line)),
105        ),
106    }
107
108    if !frontmatter.profile.is_empty() && !BUILTIN_PROFILES.contains(&frontmatter.profile.as_str())
109    {
110        diagnostics.push(
111            Diagnostic::error(
112                "HMD007",
113                format!("unknown profile '{}'", frontmatter.profile),
114            )
115            .with_source(line_span(first_line)),
116        );
117    }
118
119    for profile in &frontmatter.uses {
120        if !BUILTIN_PROFILES.contains(&profile.as_str()) {
121            diagnostics.push(
122                Diagnostic::error("HMD007", format!("unknown profile '{profile}'"))
123                    .with_source(line_span(first_line)),
124            );
125        }
126    }
127
128    frontmatter.body_start_line_index = closing_index + 1;
129    frontmatter
130}
131
132struct BlockParser<'a, 'd> {
133    source: &'a str,
134    lines: &'a [Line<'a>],
135    diagnostics: &'d mut Vec<Diagnostic>,
136}
137
138#[derive(Debug)]
139struct ParsedNodes {
140    children: Vec<HmdNode>,
141    close_line: Option<usize>,
142}
143
144impl BlockParser<'_, '_> {
145    fn parse_nodes(&mut self, index: &mut usize, close_fence_length: Option<usize>) -> ParsedNodes {
146        let mut children = Vec::new();
147        let mut markdown = String::new();
148
149        while *index < self.lines.len() {
150            let line = &self.lines[*index];
151
152            if let Some(close_length) = close_fence_length {
153                if is_closer(line, close_length) {
154                    flush_markdown(&mut markdown, &mut children, true);
155                    let close_line = line.number;
156                    *index += 1;
157                    return ParsedNodes {
158                        children,
159                        close_line: Some(close_line),
160                    };
161                }
162            }
163
164            if let Some(opener) = parse_opener(line) {
165                flush_markdown(&mut markdown, &mut children, false);
166                let block = self.parse_block(index, opener);
167                children.push(HmdNode::HmdBlock(Box::new(block)));
168                continue;
169            }
170
171            markdown.push_str(line.raw);
172            *index += 1;
173        }
174
175        flush_markdown(&mut markdown, &mut children, false);
176        ParsedNodes {
177            children,
178            close_line: None,
179        }
180    }
181
182    fn parse_block(&mut self, index: &mut usize, opener: Opener) -> HmdBlock {
183        let opener_line = &self.lines[*index];
184        if !is_valid_block_type(&opener.block_type) {
185            self.diagnostics.push(
186                Diagnostic::error(
187                    "HMD004",
188                    format!("invalid block type '{}'", opener.block_type),
189                )
190                .with_source(line_span(opener_line)),
191            );
192        }
193
194        *index += 1;
195
196        let meta = if self
197            .lines
198            .get(*index)
199            .is_some_and(|line| is_frontmatter_delimiter(line, false))
200        {
201            self.parse_block_meta(index)
202        } else {
203            Map::new()
204        };
205
206        let parsed = self.parse_nodes(index, Some(opener.fence_length));
207        if parsed.close_line.is_none() {
208            self.diagnostics.push(
209                Diagnostic::error(
210                    "HMD003",
211                    format!("unclosed semantic block '{}'", opener.block_type),
212                )
213                .with_source(line_span(opener_line)),
214            );
215        }
216
217        let id = string_field(&meta, "id");
218        HmdBlock {
219            block_type: opener.block_type,
220            id,
221            meta,
222            children: parsed.children,
223            body: None,
224            source: None,
225            diagnostics: None,
226            fence: Some(FenceInfo {
227                fence_char: ':',
228                length: opener.fence_length,
229                open_line: opener_line.number,
230                close_line: parsed.close_line,
231                open_span: None,
232                close_span: None,
233            }),
234            profile: None,
235            validation: None,
236        }
237    }
238
239    fn parse_block_meta(&mut self, index: &mut usize) -> TomlValueObject {
240        let open_line = &self.lines[*index];
241        *index += 1;
242
243        let toml_start = self
244            .lines
245            .get(*index)
246            .map(|line| line.start)
247            .unwrap_or(open_line.end);
248
249        let closing_index =
250            self.lines
251                .iter()
252                .enumerate()
253                .skip(*index)
254                .find_map(|(candidate, line)| {
255                    is_frontmatter_delimiter(line, false).then_some(candidate)
256                });
257
258        let Some(closing_index) = closing_index else {
259            self.diagnostics.push(
260                Diagnostic::error("HMD002", "unterminated block metadata")
261                    .with_source(line_span(open_line)),
262            );
263            *index = self.lines.len();
264            return Map::new();
265        };
266
267        let toml_source = &self.source[toml_start..self.lines[closing_index].start];
268        *index = closing_index + 1;
269
270        match parse_toml_object(toml_source) {
271            Ok(meta) => meta,
272            Err(message) => {
273                self.diagnostics.push(
274                    Diagnostic::error("HMD002", format!("invalid block metadata TOML: {message}"))
275                        .with_source(line_span(open_line)),
276                );
277                Map::new()
278            }
279        }
280    }
281}
282
283fn flush_markdown(markdown: &mut String, children: &mut Vec<HmdNode>, trim_trailing_blank: bool) {
284    let text = trim_markdown_slice(markdown, trim_trailing_blank);
285    markdown.clear();
286
287    if text.trim().is_empty() {
288        return;
289    }
290
291    children.push(HmdNode::Markdown(MarkdownNode::source_slice(text)));
292}
293
294fn trim_markdown_slice(text: &str, trim_trailing_blank: bool) -> String {
295    let parts = split_preserving_newlines(text);
296    let mut start = 0;
297    let mut end = parts.len();
298
299    while start < end && is_blank_markdown_line(parts[start]) {
300        start += 1;
301    }
302
303    if trim_trailing_blank {
304        while end > start && is_blank_markdown_line(parts[end - 1]) {
305            end -= 1;
306        }
307    }
308
309    parts[start..end].concat()
310}
311
/// Splits `text` after every `\n`, keeping the terminator attached to each piece.
///
/// A final piece without a terminator is included; empty input yields no pieces.
fn split_preserving_newlines(text: &str) -> Vec<&str> {
    text.split_inclusive('\n').collect()
}
330
331fn is_blank_markdown_line(line: &str) -> bool {
332    let without_lf = line.strip_suffix('\n').unwrap_or(line);
333    let without_crlf = without_lf.strip_suffix('\r').unwrap_or(without_lf);
334    trim_horizontal(without_crlf).is_empty()
335}
336
/// A recognised block-opening fence line.
#[derive(Debug, Clone)]
struct Opener {
    /// Number of leading `:` characters on the opener line (at least three).
    fence_length: usize,
    /// Text after the fence with surrounding spaces and tabs removed; never empty, but not
    /// yet checked against the block-type grammar.
    block_type: String,
}
342
343fn parse_opener(line: &Line<'_>) -> Option<Opener> {
344    let fence_length = line
345        .content
346        .bytes()
347        .take_while(|byte| *byte == b':')
348        .count();
349    if fence_length < 3 {
350        return None;
351    }
352
353    let rest = &line.content[fence_length..];
354    let block_type = trim_horizontal(rest);
355    if block_type.is_empty() {
356        return None;
357    }
358
359    Some(Opener {
360        fence_length,
361        block_type: block_type.to_string(),
362    })
363}
364
365fn is_closer(line: &Line<'_>, opener_length: usize) -> bool {
366    let trimmed = trim_horizontal(line.content);
367    let colon_count = trimmed.bytes().take_while(|byte| *byte == b':').count();
368    colon_count >= opener_length && colon_count == trimmed.len()
369}
370
371fn is_valid_block_type(block_type: &str) -> bool {
372    let mut parts = block_type.split('.');
373    let Some(first) = parts.next() else {
374        return false;
375    };
376
377    if first.is_empty() || !is_valid_identifier_segment(first) {
378        return false;
379    }
380
381    parts.all(|part| !part.is_empty() && is_valid_identifier_segment(part))
382}
383
/// Reports whether `segment` starts with an ASCII letter and continues with only ASCII
/// letters, ASCII digits, `_` or `-`. The empty string is not a valid segment.
fn is_valid_identifier_segment(segment: &str) -> bool {
    let mut rest = segment.chars();
    match rest.next() {
        Some(first) if first.is_ascii_alphabetic() => {
            rest.all(|ch| matches!(ch, 'a'..='z' | 'A'..='Z' | '0'..='9' | '_' | '-'))
        }
        _ => false,
    }
}
396
397fn build_reference_index(
398    children: &[HmdNode],
399    diagnostics: &mut Vec<Diagnostic>,
400) -> ReferenceIndex {
401    let mut records = Vec::new();
402    collect_block_records(children, None, &mut records);
403
404    let mut by_id: BTreeMap<String, Vec<BlockRecord>> = BTreeMap::new();
405    for record in records.iter().filter(|record| record.id.is_some()) {
406        by_id
407            .entry(record.id.clone().expect("filtered by id"))
408            .or_default()
409            .push(record.clone());
410    }
411
412    let mut references = ReferenceIndex::default();
413    for (id, matching_records) in by_id {
414        if matching_records.len() == 1 {
415            let record = &matching_records[0];
416            references.ids.insert(
417                id.clone(),
418                IdRecord {
419                    id,
420                    path: record.path.clone(),
421                    block_type: Some(record.block_type.clone()),
422                    source: None,
423                },
424            );
425        } else {
426            let paths = matching_records
427                .iter()
428                .map(|record| record.path.clone())
429                .collect::<Vec<_>>();
430            references.duplicates.push(DuplicateIdRecord {
431                id: id.clone(),
432                paths,
433            });
434            diagnostics.push(
435                Diagnostic::error("HMD006", format!("duplicate block id '{id}'"))
436                    .with_path(format!("/blocks/{id}")),
437            );
438        }
439    }
440
441    for record in &records {
442        if record.block_type == "recommendation" {
443            if let Some(target) = string_field(&record.meta, "option") {
444                let target_record = references.ids.get(&target);
445                let resolved = target_record
446                    .and_then(|id_record| id_record.block_type.as_deref())
447                    .is_some_and(|block_type| block_type == "option");
448
449                references.refs.push(RefRecord {
450                    from: record.path.clone(),
451                    field: "option".to_string(),
452                    target,
453                    resolved: Some(resolved),
454                    target_path: target_record
455                        .filter(|_| resolved)
456                        .map(|id_record| id_record.path.clone()),
457                    source: None,
458                });
459            }
460        }
461    }
462
463    references
464}
465
466#[derive(Debug, Clone)]
467struct BlockRecord {
468    block_type: String,
469    id: Option<String>,
470    path: String,
471    meta: TomlValueObject,
472}
473
474fn collect_block_records(
475    children: &[HmdNode],
476    parent_path: Option<&str>,
477    records: &mut Vec<BlockRecord>,
478) {
479    for (index, node) in children.iter().enumerate() {
480        let Some(block) = node.as_block() else {
481            continue;
482        };
483
484        let child_path = match parent_path {
485            Some(parent_path) => format!("{parent_path}/children/{index}"),
486            None => format!("/children/{index}"),
487        };
488        let path = block
489            .id
490            .as_ref()
491            .map(|id| format!("/blocks/{id}"))
492            .unwrap_or(child_path);
493
494        records.push(BlockRecord {
495            block_type: block.block_type.clone(),
496            id: block.id.clone(),
497            path: path.clone(),
498            meta: block.meta.clone(),
499        });
500        collect_block_records(&block.children, Some(&path), records);
501    }
502}
503
504fn parse_toml_object(source: &str) -> Result<TomlValueObject, String> {
505    source
506        .parse::<DocumentMut>()
507        .map_err(|error| error.to_string())?;
508
509    let value = toml::from_str::<toml::Value>(source).map_err(|error| error.to_string())?;
510    match toml_value_to_json(value) {
511        Value::Object(object) => Ok(object),
512        _ => Ok(Map::new()),
513    }
514}
515
516fn toml_value_to_json(value: toml::Value) -> Value {
517    match value {
518        toml::Value::String(value) => Value::String(value),
519        toml::Value::Integer(value) => Value::Number(Number::from(value)),
520        toml::Value::Float(value) => Number::from_f64(value).map_or(Value::Null, Value::Number),
521        toml::Value::Boolean(value) => Value::Bool(value),
522        toml::Value::Datetime(value) => Value::String(value.to_string()),
523        toml::Value::Array(values) => {
524            Value::Array(values.into_iter().map(toml_value_to_json).collect())
525        }
526        toml::Value::Table(values) => {
527            let object = values
528                .into_iter()
529                .map(|(key, value)| (key, toml_value_to_json(value)))
530                .collect();
531            Value::Object(object)
532        }
533    }
534}
535
536fn string_field(meta: &TomlValueObject, field: &str) -> Option<String> {
537    meta.get(field)
538        .and_then(Value::as_str)
539        .map(ToString::to_string)
540}
541
542fn string_array_field(meta: &TomlValueObject, field: &str) -> Vec<String> {
543    meta.get(field)
544        .and_then(Value::as_array)
545        .map(|values| {
546            values
547                .iter()
548                .filter_map(Value::as_str)
549                .map(ToString::to_string)
550                .collect()
551        })
552        .unwrap_or_default()
553}
554
/// One line of the source text together with its position.
#[derive(Debug, Clone)]
struct Line<'a> {
    /// The line including its terminator, if it has one.
    raw: &'a str,
    /// The line with a trailing `\n` and then a trailing `\r` removed.
    content: &'a str,
    /// Byte offset of the line's first byte in the source.
    start: usize,
    /// Byte offset just past the line, terminator included.
    end: usize,
    /// Line number, starting at 1.
    number: usize,
}
563
564fn collect_lines(source: &str) -> Vec<Line<'_>> {
565    let mut lines = Vec::new();
566    let mut start = 0;
567    let mut number = 1;
568
569    for raw in source.split_inclusive('\n') {
570        let end = start + raw.len();
571        lines.push(Line {
572            raw,
573            content: strip_line_ending(raw),
574            start,
575            end,
576            number,
577        });
578        start = end;
579        number += 1;
580    }
581
582    if start < source.len() {
583        let raw = &source[start..];
584        lines.push(Line {
585            raw,
586            content: strip_line_ending(raw),
587            start,
588            end: source.len(),
589            number,
590        });
591    }
592
593    lines
594}
595
/// Removes one trailing `\n`, if present, and then one trailing `\r`, if present.
fn strip_line_ending(line: &str) -> &str {
    let line = match line.strip_suffix('\n') {
        Some(stripped) => stripped,
        None => line,
    };
    match line.strip_suffix('\r') {
        Some(stripped) => stripped,
        None => line,
    }
}
600
601fn is_frontmatter_delimiter(line: &Line<'_>, allow_bom: bool) -> bool {
602    let content = if allow_bom {
603        line.content
604            .strip_prefix('\u{feff}')
605            .unwrap_or(line.content)
606    } else {
607        line.content
608    };
609    trim_horizontal(content) == "+++"
610}
611
/// Strips spaces and tabs, and nothing else, from both ends of `value`.
fn trim_horizontal(value: &str) -> &str {
    const HORIZONTAL_WHITESPACE: &[char] = &[' ', '\t'];
    value.trim_matches(HORIZONTAL_WHITESPACE)
}
615
616fn line_span(line: &Line<'_>) -> SourceSpan {
617    SourceSpan {
618        start: line.start,
619        end: line.end,
620        start_line: line.number,
621        start_column: 1,
622        end_line: line.number,
623        end_column: line.content.chars().count() + 1,
624    }
625}
626
#[cfg(test)]
mod tests {
    use super::*;
    use std::fs;
    use std::path::{Path, PathBuf};

    /// Resolves `path` against the directory two levels above this crate's manifest
    /// directory, which is where the fixture paths used below are rooted.
    fn repo_path(path: impl AsRef<Path>) -> PathBuf {
        let mut resolved = PathBuf::from(env!("CARGO_MANIFEST_DIR"));
        resolved.push("../..");
        resolved.push(path);
        resolved
    }

    /// Reads the fixture at `path` and parses it.
    fn parse_fixture(path: &str) -> HmdDocument {
        let fixture = repo_path(path);
        let source = fs::read_to_string(fixture).expect("reads source fixture");
        parse_document(&source)
    }

    /// Reports whether any diagnostic of `document` carries `code`.
    fn has_diagnostic(document: &HmdDocument, code: &str) -> bool {
        for diagnostic in &document.diagnostics {
            if diagnostic.code == code {
                return true;
            }
        }
        false
    }

    /// Asserts that parsing the fixture at `path` produces a diagnostic with `code`.
    fn assert_fixture_reports(path: &str, code: &str) {
        let document = parse_fixture(path);
        let reported = has_diagnostic(&document, code);
        assert!(
            reported,
            "expected diagnostic {code}, got {:?}",
            document.diagnostics
        );
    }

    /// Asserts that the parsed fixture serializes to exactly the JSON stored at
    /// `expected_path`.
    fn assert_parse_matches_fixture(source_path: &str, expected_path: &str) {
        let actual =
            serde_json::to_value(parse_fixture(source_path)).expect("serializes parsed document");

        let expected_file = repo_path(expected_path);
        let expected_source = fs::read_to_string(expected_file).expect("reads expected fixture");
        let expected: Value =
            serde_json::from_str(&expected_source).expect("expected fixture is valid JSON");

        assert_eq!(actual, expected);
    }

    #[test]
    fn frontmatter_valid_todo_has_profile_metadata() {
        let document = parse_fixture("fixtures/valid/todo-basic.hmd");

        assert_eq!(document.hmd_version, "0.1");
        assert_eq!(document.profile, "todo@0.1");
        assert!(document.uses.is_empty());

        // None of the frontmatter-related diagnostics may be present.
        assert!(!has_diagnostic(&document, "HMD001"));
        assert!(!has_diagnostic(&document, "HMD002"));
        assert!(!has_diagnostic(&document, "HMD007"));
    }

    #[test]
    fn parse_todo_basic_matches_fixture() {
        let source = "fixtures/valid/todo-basic.hmd";
        let expected = "fixtures/ir/todo-basic.json";
        assert_parse_matches_fixture(source, expected);
    }

    #[test]
    fn parse_decision_basic_matches_fixture() {
        let source = "fixtures/valid/decision-basic.hmd";
        let expected = "fixtures/ir/decision-basic.json";
        assert_parse_matches_fixture(source, expected);
    }

    #[test]
    fn invalid_missing_frontmatter_reports_hmd001() {
        assert_fixture_reports("fixtures/invalid/missing-frontmatter.hmd", "HMD001");
    }

    #[test]
    fn invalid_toml_reports_hmd002() {
        assert_fixture_reports("fixtures/invalid/invalid-toml.hmd", "HMD002");
    }

    #[test]
    fn invalid_unclosed_block_reports_hmd003() {
        assert_fixture_reports("fixtures/invalid/unclosed-block.hmd", "HMD003");
    }

    #[test]
    fn invalid_block_type_reports_hmd004() {
        assert_fixture_reports("fixtures/invalid/invalid-block-type.hmd", "HMD004");
    }

    #[test]
    fn invalid_duplicate_id_reports_hmd006() {
        assert_fixture_reports("fixtures/invalid/duplicate-id.hmd", "HMD006");
    }

    #[test]
    fn invalid_unknown_profile_reports_hmd007() {
        assert_fixture_reports("fixtures/invalid/unknown-profile.hmd", "HMD007");
    }
}