weave_content/
parser.rs

1#![allow(clippy::module_name_repetitions)]
2
3use std::fmt;
4
5use serde::{Deserialize, Serialize};
6
/// Upper bound on the front matter `id` (the kebab-case case identifier).
/// Compared against `str::len`, i.e. the UTF-8 byte length.
const MAX_CASE_ID_LEN: usize = 60;

/// Upper bound on how many `sources` entries front matter may list.
const MAX_SOURCES: usize = 20;

/// Upper bound on the H1 text (case title, and entity name in entity files).
/// Compared against `str::len`, i.e. the UTF-8 byte length.
const MAX_TITLE_LEN: usize = 200;

/// Upper bound on the trimmed summary text of a case file.
/// Compared against `str::len`, i.e. the UTF-8 byte length.
const MAX_SUMMARY_LEN: usize = 2000;

/// H2 section names accepted in case files (headings are matched
/// case-insensitively). `People` and `Organizations` are deliberately absent:
/// those entities live in standalone files under `people/` and
/// `organizations/`.
const KNOWN_CASE_SECTIONS: &[&str] = &[
    "Events",
    "Documents",
    "Assets",
    "Relationships",
    "Timeline",
];
24
25/// A parsed case file with front matter, title, summary, and raw sections.
26#[derive(Debug)]
27pub struct ParsedCase {
28    pub id: String,
29    /// NULID for the case node (None if not yet generated).
30    pub nulid: Option<String>,
31    pub sources: Vec<SourceEntry>,
32    pub title: String,
33    pub summary: String,
34    pub sections: Vec<Section>,
35    /// Case type from front matter (e.g. `corruption`, `fraud`).
36    pub case_type: Option<String>,
37    /// Case status from front matter (e.g. `open`, `trial`).
38    pub status: Option<String>,
39    /// Tags from front matter for categorization.
40    pub tags: Vec<String>,
41}
42
43/// A raw H2 section with its heading text and body content.
44#[derive(Debug)]
45pub struct Section {
46    pub kind: SectionKind,
47    pub body: String,
48    /// Line number (1-indexed) where the H2 heading appears in the original file.
49    pub line: usize,
50}
51
/// The kind of an H2 section, resolved from its heading text.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum SectionKind {
    People,
    Organizations,
    Events,
    Documents,
    Assets,
    Relationships,
    Timeline,
}

impl SectionKind {
    /// Resolve heading text to a section kind.
    ///
    /// Surrounding whitespace is ignored and the comparison is ASCII
    /// case-insensitive. Returns `None` for any other heading.
    fn from_heading(heading: &str) -> Option<Self> {
        const BY_NAME: [(&str, SectionKind); 7] = [
            ("People", SectionKind::People),
            ("Organizations", SectionKind::Organizations),
            ("Events", SectionKind::Events),
            ("Documents", SectionKind::Documents),
            ("Assets", SectionKind::Assets),
            ("Relationships", SectionKind::Relationships),
            ("Timeline", SectionKind::Timeline),
        ];

        let wanted = heading.trim();
        BY_NAME
            .iter()
            .find(|(name, _)| wanted.eq_ignore_ascii_case(name))
            .map(|&(_, kind)| kind)
    }

    /// Whether this kind may appear in a case file.
    ///
    /// `People` and `Organizations` are recognised as headings but are no
    /// longer permitted inside case files.
    pub fn is_case_section(self) -> bool {
        match self {
            Self::People | Self::Organizations => false,
            Self::Events
            | Self::Documents
            | Self::Assets
            | Self::Relationships
            | Self::Timeline => true,
        }
    }
}
87
/// A parser diagnostic tied to a location in the input file.
///
/// Implements [`std::error::Error`] so a single diagnostic can be boxed as
/// `dyn Error` or attached as the source of a higher-level error.
#[derive(Debug)]
pub struct ParseError {
    /// 1-indexed line the diagnostic refers to.
    pub line: usize,
    /// Human-readable description of the problem.
    pub message: String,
}

impl fmt::Display for ParseError {
    /// Renders the diagnostic as `line <n>: <message>`.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        write!(f, "line {}: {}", self.line, self.message)
    }
}

// No underlying cause to expose, so the default `source()` (None) is correct.
impl std::error::Error for ParseError {}
100
/// Upper bound on the number of front matter tags in a case file.
const MAX_CASE_TAGS: usize = 10;

/// Upper bound on the number of front matter tags in an entity file.
const MAX_ENTITY_TAGS: usize = 5;

/// Upper bound on a single tag, compared against `str::len` (UTF-8 bytes).
const MAX_TAG_LEN: usize = 50;
109
110/// YAML front matter schema.
111#[derive(Deserialize)]
112struct FrontMatter {
113    id: String,
114    /// NULID for the case node (auto-generated on first build).
115    #[serde(default)]
116    nulid: Option<String>,
117    #[serde(default)]
118    sources: Vec<SourceEntry>,
119    #[serde(default)]
120    case_type: Option<String>,
121    #[serde(default)]
122    status: Option<String>,
123    #[serde(default)]
124    tags: Vec<String>,
125}
126
127/// A source entry in front matter. Supports both bare URL strings and
128/// structured objects with metadata.
129#[derive(Debug, Clone, PartialEq, Eq, Deserialize, Serialize)]
130#[serde(untagged)]
131pub enum SourceEntry {
132    /// Plain URL string (backward-compatible).
133    Url(String),
134    /// Structured source with metadata.
135    Structured {
136        url: String,
137        #[serde(default)]
138        title: Option<String>,
139        #[serde(default)]
140        published_at: Option<String>,
141        #[serde(default)]
142        language: Option<String>,
143    },
144}
145
146impl SourceEntry {
147    /// Get the URL from any source entry variant.
148    pub fn url(&self) -> &str {
149        match self {
150            Self::Url(u) => u,
151            Self::Structured { url, .. } => url,
152        }
153    }
154}
155
156/// YAML front matter schema for standalone entity files.
157/// Only contains an optional `id` field (NULID, generated on first build).
158#[derive(Deserialize)]
159struct EntityFrontMatter {
160    #[serde(default)]
161    id: Option<String>,
162    #[serde(default)]
163    tags: Vec<String>,
164}
165
/// The outcome of parsing a standalone entity file (actor or institution).
#[derive(Debug)]
pub struct ParsedEntityFile {
    /// NULID stored in front matter; `None` if none has been generated yet.
    pub id: Option<String>,
    /// Entity name, taken from the H1 heading.
    pub name: String,
    /// Raw lines following the H1 (bullet fields; entity files have no sections).
    pub body: String,
    /// Line of the H1 heading in the original file.
    pub title_line: usize,
    /// Tags listed in front matter.
    pub tags: Vec<String>,
}
180
181/// Parse a Markdown case file into a `ParsedCase`.
182///
183/// Extracts YAML front matter, H1 title, summary, and H2 sections.
184/// Returns errors for malformed structure or boundary violations.
185pub fn parse(input: &str) -> Result<ParsedCase, Vec<ParseError>> {
186    let mut errors = Vec::new();
187
188    // Extract front matter
189    let (front_matter, body_start_line, body) = extract_front_matter(input, &mut errors);
190
191    let Some(front_matter) = front_matter else {
192        if errors.is_empty() {
193            errors.push(ParseError {
194                line: 1,
195                message: "missing YAML front matter (expected `---` delimiter)".into(),
196            });
197        }
198        return Err(errors);
199    };
200
201    // Validate front matter fields
202    validate_front_matter(&front_matter, &mut errors);
203
204    // Extract title, summary, and sections from body
205    let (title, summary, sections) = extract_body(&body, body_start_line, &mut errors);
206
207    if !errors.is_empty() {
208        return Err(errors);
209    }
210
211    Ok(ParsedCase {
212        id: front_matter.id,
213        nulid: front_matter.nulid,
214        sources: front_matter.sources,
215        title,
216        summary,
217        sections,
218        case_type: front_matter.case_type,
219        status: front_matter.status,
220        tags: front_matter.tags,
221    })
222}
223
224/// Parse a standalone entity file (actor or institution).
225///
226/// Entity files have YAML front matter with optional `id:`, an H1 name,
227/// and bullet fields directly in the body. No H2 sections are allowed.
228pub fn parse_entity_file(input: &str) -> Result<ParsedEntityFile, Vec<ParseError>> {
229    let mut errors = Vec::new();
230
231    let (front_matter, body_start_line, body) = extract_entity_front_matter(input, &mut errors);
232
233    let id = front_matter.as_ref().and_then(|fm| fm.id.clone());
234    let tags = front_matter.map_or_else(Vec::new, |fm| fm.tags);
235
236    // Validate entity tags
237    if tags.len() > MAX_ENTITY_TAGS {
238        errors.push(ParseError {
239            line: 2,
240            message: format!(
241                "front matter `tags` exceeds {MAX_ENTITY_TAGS} entries (got {})",
242                tags.len()
243            ),
244        });
245    }
246    for (i, tag) in tags.iter().enumerate() {
247        if tag.len() > MAX_TAG_LEN {
248            errors.push(ParseError {
249                line: 2,
250                message: format!("front matter tag #{} exceeds {MAX_TAG_LEN} chars", i + 1),
251            });
252        }
253        if tag.is_empty() {
254            errors.push(ParseError {
255                line: 2,
256                message: format!("front matter tag #{} is empty", i + 1),
257            });
258        }
259    }
260
261    // Extract H1 title and body content (no sections allowed)
262    let (name, title_line, field_body) = extract_entity_body(&body, body_start_line, &mut errors);
263
264    if !errors.is_empty() {
265        return Err(errors);
266    }
267
268    Ok(ParsedEntityFile {
269        id,
270        name,
271        body: field_body,
272        title_line,
273        tags,
274    })
275}
276
277/// Extract YAML front matter for entity files.
278/// Front matter is optional for entity files -- if absent, returns None with no error.
279fn extract_entity_front_matter(
280    input: &str,
281    errors: &mut Vec<ParseError>,
282) -> (Option<EntityFrontMatter>, usize, String) {
283    let lines: Vec<&str> = input.lines().collect();
284
285    let first_delim = lines.iter().position(|l| l.trim() == "---");
286    if first_delim != Some(0) {
287        // No front matter -- entire file is body, starting at line 1
288        return (None, 1, input.to_string());
289    }
290
291    let close_delim = lines[1..].iter().position(|l| l.trim() == "---");
292    let Some(close_offset) = close_delim else {
293        errors.push(ParseError {
294            line: 1,
295            message: "unclosed YAML front matter (missing closing `---`)".into(),
296        });
297        return (None, 1, String::new());
298    };
299
300    let close_line = close_offset + 1;
301    let yaml_str: String = lines[1..close_line].join("\n");
302    let body_start_line = close_line + 2; // 1-indexed line number after closing `---`
303    let body = lines[close_line + 1..].join("\n");
304
305    match serde_yaml::from_str::<EntityFrontMatter>(&yaml_str) {
306        Ok(fm) => (Some(fm), body_start_line, body),
307        Err(e) => {
308            errors.push(ParseError {
309                line: 2,
310                message: format!("invalid YAML front matter: {e}"),
311            });
312            (None, body_start_line, body)
313        }
314    }
315}
316
317/// Extract H1 name and field body from an entity file.
318/// Rejects any H2 sections.
319fn extract_entity_body(
320    body: &str,
321    body_start_line: usize,
322    errors: &mut Vec<ParseError>,
323) -> (String, usize, String) {
324    let lines: Vec<&str> = body.lines().collect();
325    let mut name = String::new();
326    let mut title_found = false;
327    let mut title_line = body_start_line;
328    let mut field_lines: Vec<&str> = Vec::new();
329
330    for (i, line) in lines.iter().enumerate() {
331        let file_line = body_start_line + i;
332
333        if let Some(heading) = strip_heading(line, 1) {
334            if title_found {
335                errors.push(ParseError {
336                    line: file_line,
337                    message: "multiple H1 headings found (expected exactly one)".into(),
338                });
339                continue;
340            }
341            name = heading.to_string();
342            title_found = true;
343            title_line = file_line;
344            continue;
345        }
346
347        // Reject H2 sections in entity files
348        if strip_heading(line, 2).is_some() {
349            errors.push(ParseError {
350                line: file_line,
351                message: "H2 sections are not allowed in entity files".into(),
352            });
353            continue;
354        }
355
356        if title_found {
357            field_lines.push(line);
358        } else if !line.trim().is_empty() {
359            errors.push(ParseError {
360                line: file_line,
361                message: "expected H1 heading (# Name)".into(),
362            });
363        }
364    }
365
366    if !title_found {
367        errors.push(ParseError {
368            line: body_start_line,
369            message: "missing H1 heading".into(),
370        });
371    } else if name.len() > MAX_TITLE_LEN {
372        errors.push(ParseError {
373            line: title_line,
374            message: format!("H1 name exceeds {MAX_TITLE_LEN} chars (got {})", name.len()),
375        });
376    }
377
378    (name, title_line, field_lines.join("\n"))
379}
380
381/// Extract YAML front matter delimited by `---` lines.
382/// Returns the parsed front matter, the line number where the body starts,
383/// and the body text.
384fn extract_front_matter(
385    input: &str,
386    errors: &mut Vec<ParseError>,
387) -> (Option<FrontMatter>, usize, String) {
388    let lines: Vec<&str> = input.lines().collect();
389
390    // First non-empty line must be `---`
391    let first_delim = lines.iter().position(|l| l.trim() == "---");
392    if first_delim != Some(0) {
393        errors.push(ParseError {
394            line: 1,
395            message: "missing YAML front matter (expected `---` on first line)".into(),
396        });
397        return (None, 1, input.to_string());
398    }
399
400    // Find closing `---`
401    let close_delim = lines[1..].iter().position(|l| l.trim() == "---");
402    let Some(close_offset) = close_delim else {
403        errors.push(ParseError {
404            line: 1,
405            message: "unclosed YAML front matter (missing closing `---`)".into(),
406        });
407        return (None, 1, String::new());
408    };
409
410    let close_line = close_offset + 1; // index in `lines`
411    let yaml_str: String = lines[1..close_line].join("\n");
412    let body_start_line = close_line + 2; // 1-indexed line number after closing `---`
413    let body = lines[close_line + 1..].join("\n");
414
415    match serde_yaml::from_str::<FrontMatter>(&yaml_str) {
416        Ok(fm) => (Some(fm), body_start_line, body),
417        Err(e) => {
418            errors.push(ParseError {
419                line: 2,
420                message: format!("invalid YAML front matter: {e}"),
421            });
422            (None, body_start_line, body)
423        }
424    }
425}
426
427fn validate_front_matter(fm: &FrontMatter, errors: &mut Vec<ParseError>) {
428    // Validate case ID
429    if fm.id.is_empty() {
430        errors.push(ParseError {
431            line: 2,
432            message: "front matter `id` must not be empty".into(),
433        });
434    } else if fm.id.len() > MAX_CASE_ID_LEN {
435        errors.push(ParseError {
436            line: 2,
437            message: format!(
438                "front matter `id` exceeds {MAX_CASE_ID_LEN} chars (got {})",
439                fm.id.len()
440            ),
441        });
442    } else if !is_kebab_case(&fm.id) {
443        errors.push(ParseError {
444            line: 2,
445            message: format!(
446                "front matter `id` must be kebab-case [a-z0-9-], got {:?}",
447                fm.id
448            ),
449        });
450    }
451
452    // Validate sources count
453    if fm.sources.len() > MAX_SOURCES {
454        errors.push(ParseError {
455            line: 2,
456            message: format!(
457                "front matter `sources` exceeds {MAX_SOURCES} entries (got {})",
458                fm.sources.len()
459            ),
460        });
461    }
462
463    // Validate each source URL is HTTPS
464    for (i, source) in fm.sources.iter().enumerate() {
465        if !source.url().starts_with("https://") {
466            errors.push(ParseError {
467                line: 2,
468                message: format!("source[{i}] must be HTTPS, got {:?}", source.url()),
469            });
470        }
471    }
472
473    // Validate case_type
474    if let Some(ct) = &fm.case_type {
475        use crate::domain::CaseType;
476        let normalized = ct.to_lowercase().replace(' ', "_");
477        if !CaseType::KNOWN.contains(&normalized.as_str())
478            && crate::domain::parse_custom(ct).is_none()
479        {
480            errors.push(ParseError {
481                line: 2,
482                message: format!(
483                    "invalid case_type {:?} (known: {}; use \"custom:Value\" for custom)",
484                    ct,
485                    CaseType::KNOWN.join(", ")
486                ),
487            });
488        }
489    }
490
491    // Validate status
492    if let Some(st) = &fm.status {
493        use crate::domain::CaseStatus;
494        let normalized = st.to_lowercase().replace(' ', "_");
495        if !CaseStatus::KNOWN.contains(&normalized.as_str()) {
496            errors.push(ParseError {
497                line: 2,
498                message: format!(
499                    "invalid status {:?} (known: {})",
500                    st,
501                    CaseStatus::KNOWN.join(", ")
502                ),
503            });
504        }
505    }
506
507    // Validate tags
508    if fm.tags.len() > MAX_CASE_TAGS {
509        errors.push(ParseError {
510            line: 2,
511            message: format!(
512                "front matter `tags` exceeds {MAX_CASE_TAGS} entries (got {})",
513                fm.tags.len()
514            ),
515        });
516    }
517    for (i, tag) in fm.tags.iter().enumerate() {
518        if tag.len() > MAX_TAG_LEN {
519            errors.push(ParseError {
520                line: 2,
521                message: format!("tag[{i}] exceeds {MAX_TAG_LEN} chars (got {})", tag.len()),
522            });
523        }
524        if tag.is_empty() {
525            errors.push(ParseError {
526                line: 2,
527                message: format!("tag[{i}] must not be empty"),
528            });
529        }
530    }
531}
532
/// Report whether `s` is kebab-case, i.e. matches `[a-z0-9]+(-[a-z0-9]+)*`.
///
/// Equivalently: splitting on `-` must yield only non-empty segments made of
/// ASCII lowercase letters and digits. This rules out the empty string,
/// leading or trailing dashes, and doubled dashes.
fn is_kebab_case(s: &str) -> bool {
    s.split('-').all(|segment| {
        !segment.is_empty()
            && segment
                .bytes()
                .all(|b| b.is_ascii_lowercase() || b.is_ascii_digit())
    })
}
542
543/// Extract the H1 title, summary text, and H2 sections from the body.
544#[allow(clippy::too_many_lines)]
545fn extract_body(
546    body: &str,
547    body_start_line: usize,
548    errors: &mut Vec<ParseError>,
549) -> (String, String, Vec<Section>) {
550    let lines: Vec<&str> = body.lines().collect();
551    let mut title = String::new();
552    let mut title_found = false;
553    let mut summary_lines: Vec<&str> = Vec::new();
554    let mut sections: Vec<Section> = Vec::new();
555
556    // Track current H2 section being built
557    let mut current_section_kind: Option<SectionKind> = None;
558    let mut current_section_line: usize = 0;
559    let mut current_section_body: Vec<&str> = Vec::new();
560
561    // State: before H1, after H1 (summary), in sections
562    let mut state = State::BeforeTitle;
563
564    for (i, line) in lines.iter().enumerate() {
565        let file_line = body_start_line + i; // 1-indexed line in original file
566
567        if let Some(heading) = strip_heading(line, 1) {
568            if title_found {
569                errors.push(ParseError {
570                    line: file_line,
571                    message: "multiple H1 headings found (expected exactly one)".into(),
572                });
573                continue;
574            }
575            title = heading.to_string();
576            title_found = true;
577            state = State::Summary;
578            continue;
579        }
580
581        if let Some(heading) = strip_heading(line, 2) {
582            // Flush previous section
583            if let Some(kind) = current_section_kind.take() {
584                sections.push(Section {
585                    kind,
586                    body: current_section_body.join("\n"),
587                    line: current_section_line,
588                });
589                current_section_body.clear();
590            }
591
592            match SectionKind::from_heading(heading) {
593                Some(kind) if kind.is_case_section() => {
594                    // Check for duplicate sections
595                    if sections.iter().any(|s| s.kind == kind) {
596                        errors.push(ParseError {
597                            line: file_line,
598                            message: format!("duplicate section: ## {heading}"),
599                        });
600                    }
601                    current_section_kind = Some(kind);
602                    current_section_line = file_line;
603                    state = State::InSection;
604                }
605                Some(_) => {
606                    // Legacy section (People/Organizations) -- not allowed in case files
607                    errors.push(ParseError {
608                        line: file_line,
609                        message: format!(
610                            "## {heading} is not allowed in case files (use standalone entity files in people/ or organizations/ instead)"
611                        ),
612                    });
613                }
614                None => {
615                    errors.push(ParseError {
616                        line: file_line,
617                        message: format!(
618                            "unknown section: ## {heading} (expected one of: {})",
619                            KNOWN_CASE_SECTIONS.join(", ")
620                        ),
621                    });
622                }
623            }
624            continue;
625        }
626
627        match state {
628            State::BeforeTitle => {
629                // Skip blank lines before title
630                if !line.trim().is_empty() {
631                    errors.push(ParseError {
632                        line: file_line,
633                        message: "expected H1 title (# Title)".into(),
634                    });
635                }
636            }
637            State::Summary => {
638                summary_lines.push(line);
639            }
640            State::InSection => {
641                current_section_body.push(line);
642            }
643        }
644    }
645
646    // Flush last section
647    if let Some(kind) = current_section_kind.take() {
648        sections.push(Section {
649            kind,
650            body: current_section_body.join("\n"),
651            line: current_section_line,
652        });
653    }
654
655    // Validate title
656    if !title_found {
657        errors.push(ParseError {
658            line: body_start_line,
659            message: "missing H1 title".into(),
660        });
661    } else if title.len() > MAX_TITLE_LEN {
662        errors.push(ParseError {
663            line: body_start_line,
664            message: format!(
665                "H1 title exceeds {MAX_TITLE_LEN} chars (got {})",
666                title.len()
667            ),
668        });
669    }
670
671    // Build summary (trim leading/trailing blank lines)
672    let summary = summary_lines.clone().join("\n").trim().to_string();
673
674    if summary.len() > MAX_SUMMARY_LEN {
675        errors.push(ParseError {
676            line: body_start_line,
677            message: format!(
678                "summary exceeds {MAX_SUMMARY_LEN} chars (got {})",
679                summary.len()
680            ),
681        });
682    }
683
684    (title, summary, sections)
685}
686
/// Position of the body scanner in `extract_body`; decides where a
/// non-heading line is routed.
#[derive(Clone, Copy)]
enum State {
    /// No H1 seen yet; only blank lines are acceptable.
    BeforeTitle,
    /// After the H1; lines are collected into the summary.
    Summary,
    /// After an accepted H2; lines are collected into that section's body.
    InSection,
}
693
/// Strip an ATX heading prefix of exactly the given level and return the
/// trimmed heading text.
///
/// Leading whitespace on the line is ignored. The line matches only when it
/// starts with exactly `level` `#` characters followed by either the end of
/// the line (yielding `Some("")`) or a space. A deeper heading never matches
/// a shallower level: `strip_heading("### Foo", 2)` is `None`.
///
/// Heading text may itself begin with `#`, e.g.
/// `strip_heading("# #1 suspect", 1)` returns `Some("#1 suspect")`.
///
/// E.g., `strip_heading("## Foo", 2)` returns `Some("Foo")`.
fn strip_heading(line: &str, level: usize) -> Option<&str> {
    let trimmed = line.trim_start();

    // Counting the whole run of hashes distinguishes `##` from `###` without
    // building a prefix string. `#` is ASCII, so the count is a valid byte
    // offset into `trimmed`.
    let hashes = trimmed.bytes().take_while(|&b| b == b'#').count();
    if hashes != level {
        return None;
    }

    let after = &trimmed[hashes..];
    if after.is_empty() {
        return Some("");
    }

    // A space must separate the hashes from the text (`#Foo` is not a heading).
    after.strip_prefix(' ').map(str::trim)
}
716
717#[cfg(test)]
718mod tests {
719    use super::*;
720
721    fn minimal_case() -> String {
722        [
723            "---",
724            "id: test-case",
725            "sources:",
726            "  - https://example.com/source",
727            "---",
728            "",
729            "# Test Case Title",
730            "",
731            "This is the summary.",
732            "",
733            "## Events",
734            "",
735            "### Something happened",
736            "- occurred_at: 2025-01-01",
737            "",
738            "## Relationships",
739            "",
740            "- Something happened -> Something happened: associate_of",
741        ]
742        .join("\n")
743    }
744
745    #[test]
746    fn parse_minimal_case() {
747        let result = parse(&minimal_case());
748        let case = result.unwrap_or_else(|errs| {
749            panic!(
750                "parse failed: {}",
751                errs.iter()
752                    .map(ToString::to_string)
753                    .collect::<Vec<_>>()
754                    .join("; ")
755            );
756        });
757
758        assert_eq!(case.id, "test-case");
759        assert_eq!(case.sources.len(), 1);
760        assert_eq!(case.sources[0].url(), "https://example.com/source");
761        assert_eq!(case.title, "Test Case Title");
762        assert_eq!(case.summary, "This is the summary.");
763        assert_eq!(case.sections.len(), 2);
764        assert_eq!(case.sections[0].kind, SectionKind::Events);
765        assert_eq!(case.sections[1].kind, SectionKind::Relationships);
766    }
767
768    #[test]
769    fn parse_missing_front_matter() {
770        let input = "# Title\n\nSummary.\n";
771        let errs = parse(input).unwrap_err();
772        assert!(errs.iter().any(|e| e.message.contains("front matter")));
773    }
774
775    #[test]
776    fn parse_unclosed_front_matter() {
777        let input = "---\nid: test\n# Title\n";
778        let errs = parse(input).unwrap_err();
779        assert!(errs.iter().any(|e| e.message.contains("unclosed")));
780    }
781
782    #[test]
783    fn parse_invalid_case_id_uppercase() {
784        let input = "---\nid: Test-Case\nsources: []\n---\n\n# Title\n";
785        let errs = parse(input).unwrap_err();
786        assert!(errs.iter().any(|e| e.message.contains("kebab-case")));
787    }
788
789    #[test]
790    fn parse_case_id_too_long() {
791        let long_id = "a".repeat(61);
792        let input = format!("---\nid: {long_id}\nsources: []\n---\n\n# Title\n");
793        let errs = parse(&input).unwrap_err();
794        assert!(errs.iter().any(|e| e.message.contains("exceeds 60")));
795    }
796
797    #[test]
798    fn parse_non_https_source() {
799        let input = "---\nid: test\nsources:\n  - http://example.com\n---\n\n# Title\n";
800        let errs = parse(input).unwrap_err();
801        assert!(errs.iter().any(|e| e.message.contains("HTTPS")));
802    }
803
804    #[test]
805    fn parse_too_many_sources() {
806        let sources: Vec<String> = (0..21)
807            .map(|i| format!("  - https://example.com/{i}"))
808            .collect();
809        let input = format!(
810            "---\nid: test\nsources:\n{}\n---\n\n# Title\n",
811            sources.join("\n")
812        );
813        let errs = parse(&input).unwrap_err();
814        assert!(errs.iter().any(|e| e.message.contains("exceeds 20")));
815    }
816
817    #[test]
818    fn parse_unknown_section() {
819        let input = [
820            "---",
821            "id: test",
822            "sources: []",
823            "---",
824            "",
825            "# Title",
826            "",
827            "## Unknown Section",
828            "",
829        ]
830        .join("\n");
831        let errs = parse(&input).unwrap_err();
832        assert!(errs.iter().any(|e| e.message.contains("unknown section")));
833    }
834
835    #[test]
836    fn parse_duplicate_section() {
837        let input = [
838            "---",
839            "id: test",
840            "sources: []",
841            "---",
842            "",
843            "# Title",
844            "",
845            "## Events",
846            "",
847            "## Events",
848            "",
849        ]
850        .join("\n");
851        let errs = parse(&input).unwrap_err();
852        assert!(errs.iter().any(|e| e.message.contains("duplicate")));
853    }
854
855    #[test]
856    fn parse_multiple_h1() {
857        let input = [
858            "---",
859            "id: test",
860            "sources: []",
861            "---",
862            "",
863            "# First Title",
864            "",
865            "# Second Title",
866            "",
867        ]
868        .join("\n");
869        let errs = parse(&input).unwrap_err();
870        assert!(errs.iter().any(|e| e.message.contains("multiple H1")));
871    }
872
873    #[test]
874    fn parse_all_sections() {
875        let input = [
876            "---",
877            "id: full-case",
878            "sources:",
879            "  - https://example.com/a",
880            "---",
881            "",
882            "# Full Case",
883            "",
884            "Summary text here.",
885            "",
886            "## Events",
887            "",
888            "### Something happened",
889            "- occurred_at: 2025-01-01",
890            "",
891            "## Relationships",
892            "",
893            "- Alice -> Corp Inc: employed_by",
894            "",
895            "## Timeline",
896            "",
897            "Something happened",
898        ]
899        .join("\n");
900
901        let case = parse(&input).unwrap_or_else(|errs| {
902            panic!(
903                "parse failed: {}",
904                errs.iter()
905                    .map(ToString::to_string)
906                    .collect::<Vec<_>>()
907                    .join("; ")
908            );
909        });
910
911        assert_eq!(case.id, "full-case");
912        assert_eq!(case.title, "Full Case");
913        assert_eq!(case.summary, "Summary text here.");
914        assert_eq!(case.sections.len(), 3);
915        assert_eq!(case.sections[0].kind, SectionKind::Events);
916        assert_eq!(case.sections[1].kind, SectionKind::Relationships);
917        assert_eq!(case.sections[2].kind, SectionKind::Timeline);
918    }
919
920    #[test]
921    fn parse_empty_summary() {
922        let input = [
923            "---",
924            "id: test",
925            "sources: []",
926            "---",
927            "",
928            "# Title",
929            "",
930            "## Events",
931            "",
932        ]
933        .join("\n");
934
935        let case = parse(&input).unwrap_or_else(|errs| {
936            panic!(
937                "parse failed: {}",
938                errs.iter()
939                    .map(ToString::to_string)
940                    .collect::<Vec<_>>()
941                    .join("; ")
942            );
943        });
944        assert_eq!(case.summary, "");
945    }
946
947    #[test]
948    fn parse_multiline_summary() {
949        let input = [
950            "---",
951            "id: test",
952            "sources: []",
953            "---",
954            "",
955            "# Title",
956            "",
957            "First line of summary.",
958            "Second line of summary.",
959            "",
960            "## Events",
961            "",
962        ]
963        .join("\n");
964
965        let case = parse(&input).unwrap_or_else(|errs| {
966            panic!(
967                "parse failed: {}",
968                errs.iter()
969                    .map(ToString::to_string)
970                    .collect::<Vec<_>>()
971                    .join("; ")
972            );
973        });
974        assert_eq!(
975            case.summary,
976            "First line of summary.\nSecond line of summary."
977        );
978    }
979
980    #[test]
981    fn strip_heading_levels() {
982        assert_eq!(strip_heading("# Title", 1), Some("Title"));
983        assert_eq!(strip_heading("## Section", 2), Some("Section"));
984        assert_eq!(strip_heading("### Entity", 3), Some("Entity"));
985        // H3 should not match H2
986        assert_eq!(strip_heading("### Entity", 2), None);
987        // H2 should not match H1
988        assert_eq!(strip_heading("## Section", 1), None);
989        // Not a heading
990        assert_eq!(strip_heading("Normal text", 1), None);
991    }
992
993    #[test]
994    fn kebab_case_validation() {
995        assert!(is_kebab_case("valid-case-id"));
996        assert!(is_kebab_case("a"));
997        assert!(is_kebab_case("test-123"));
998        assert!(!is_kebab_case(""));
999        assert!(!is_kebab_case("-leading"));
1000        assert!(!is_kebab_case("trailing-"));
1001        assert!(!is_kebab_case("double--dash"));
1002        assert!(!is_kebab_case("Upper"));
1003        assert!(!is_kebab_case("has space"));
1004    }
1005
1006    #[test]
1007    fn section_body_content() {
1008        let input = [
1009            "---",
1010            "id: test",
1011            "sources: []",
1012            "---",
1013            "",
1014            "# Title",
1015            "",
1016            "## Events",
1017            "",
1018            "### Bonnick dismissal",
1019            "- occurred_at: 2024-12-24",
1020            "- type: termination",
1021            "",
1022        ]
1023        .join("\n");
1024
1025        let case = parse(&input).unwrap_or_else(|errs| {
1026            panic!(
1027                "parse failed: {}",
1028                errs.iter()
1029                    .map(ToString::to_string)
1030                    .collect::<Vec<_>>()
1031                    .join("; ")
1032            );
1033        });
1034
1035        assert_eq!(case.sections.len(), 1);
1036        let body = &case.sections[0].body;
1037        assert!(body.contains("### Bonnick dismissal"));
1038        assert!(body.contains("- occurred_at: 2024-12-24"));
1039    }
1040
1041    #[test]
1042    fn parse_rejects_people_section_in_case_file() {
1043        let input = [
1044            "---",
1045            "id: test",
1046            "sources: []",
1047            "---",
1048            "",
1049            "# Title",
1050            "",
1051            "## People",
1052            "",
1053        ]
1054        .join("\n");
1055        let errs = parse(&input).unwrap_err();
1056        assert!(
1057            errs.iter()
1058                .any(|e| e.message.contains("not allowed in case files"))
1059        );
1060    }
1061
1062    #[test]
1063    fn parse_rejects_organizations_section_in_case_file() {
1064        let input = [
1065            "---",
1066            "id: test",
1067            "sources: []",
1068            "---",
1069            "",
1070            "# Title",
1071            "",
1072            "## Organizations",
1073            "",
1074        ]
1075        .join("\n");
1076        let errs = parse(&input).unwrap_err();
1077        assert!(
1078            errs.iter()
1079                .any(|e| e.message.contains("not allowed in case files"))
1080        );
1081    }
1082
1083    #[test]
1084    fn parse_entity_file_with_id() {
1085        let input = [
1086            "---",
1087            "id: 01JXYZ123456789ABCDEFGHIJK",
1088            "---",
1089            "",
1090            "# Mark Bonnick",
1091            "",
1092            "- qualifier: Arsenal Kit Manager",
1093            "- nationality: British",
1094            "",
1095        ]
1096        .join("\n");
1097
1098        let result = parse_entity_file(&input).unwrap();
1099        assert_eq!(result.id.as_deref(), Some("01JXYZ123456789ABCDEFGHIJK"));
1100        assert_eq!(result.name, "Mark Bonnick");
1101        assert!(result.body.contains("- qualifier: Arsenal Kit Manager"));
1102        assert!(result.body.contains("- nationality: British"));
1103    }
1104
1105    #[test]
1106    fn parse_entity_file_without_id() {
1107        let input = [
1108            "---",
1109            "---",
1110            "",
1111            "# Arsenal FC",
1112            "",
1113            "- qualifier: English Football Club",
1114            "- org_type: sports_club",
1115            "",
1116        ]
1117        .join("\n");
1118
1119        let result = parse_entity_file(&input).unwrap();
1120        assert!(result.id.is_none());
1121        assert_eq!(result.name, "Arsenal FC");
1122    }
1123
1124    #[test]
1125    fn parse_entity_file_no_front_matter() {
1126        let input = ["# Bob Smith", "", "- nationality: Dutch", ""].join("\n");
1127
1128        let result = parse_entity_file(&input).unwrap();
1129        assert!(result.id.is_none());
1130        assert_eq!(result.name, "Bob Smith");
1131        assert!(result.body.contains("- nationality: Dutch"));
1132    }
1133
1134    #[test]
1135    fn parse_entity_file_rejects_h2_sections() {
1136        let input = [
1137            "---",
1138            "---",
1139            "",
1140            "# Test Entity",
1141            "",
1142            "## Relationships",
1143            "",
1144        ]
1145        .join("\n");
1146
1147        let errs = parse_entity_file(&input).unwrap_err();
1148        assert!(errs.iter().any(|e| e.message.contains("H2 sections")));
1149    }
1150
1151    #[test]
1152    fn parse_entity_file_missing_h1() {
1153        let input = ["---", "---", "", "- nationality: Dutch", ""].join("\n");
1154
1155        let errs = parse_entity_file(&input).unwrap_err();
1156        assert!(errs.iter().any(|e| e.message.contains("missing H1")));
1157    }
1158}
weave_content/parser.rs

weave_content/
parser.rs