weave_content/
parser.rs

1#![allow(clippy::module_name_repetitions)]
2
3use std::fmt;
4
5use serde::{Deserialize, Serialize};
6
/// Maximum length of a case ID (kebab-case identifier).
///
/// Compared against `str::len()` of the front matter `id`, i.e. the unit is
/// UTF-8 bytes.
const MAX_CASE_ID_LEN: usize = 60;

/// Maximum number of sources in front matter.
const MAX_SOURCES: usize = 20;

/// Maximum length of the case title (H1).
///
/// Also applied to the H1 name of standalone entity files. Compared against
/// `str::len()`, i.e. UTF-8 bytes, even though the error messages say "chars".
const MAX_TITLE_LEN: usize = 200;

/// Maximum length of the case summary.
///
/// Compared against `str::len()` of the trimmed summary, i.e. UTF-8 bytes.
const MAX_SUMMARY_LEN: usize = 2000;

/// Known H2 section names for case files (case-insensitive match).
/// People and Organizations are no longer allowed in case files -- they
/// live in standalone entity files under `people/` and `organizations/`.
///
/// In the code visible here this list is only used to build the
/// "unknown section" error message; the actual matching is done by
/// `SectionKind::from_heading` and `SectionKind::is_case_section`.
const KNOWN_CASE_SECTIONS: &[&str] =
    &["Events", "Documents", "Assets", "Relationships", "Timeline"];
24
/// A parsed case file with front matter, title, summary, and raw sections.
///
/// Produced by `parse`; a value only exists when parsing reported no errors.
#[derive(Debug)]
pub struct ParsedCase {
    /// Case identifier from the front matter `id` field (validated as kebab-case).
    pub id: String,
    /// Source entries from the front matter `sources` list.
    pub sources: Vec<SourceEntry>,
    /// Text of the single H1 heading.
    pub title: String,
    /// Text between the H1 heading and the first H2 section, with surrounding
    /// whitespace trimmed. May be empty.
    pub summary: String,
    /// H2 sections in the order they appear in the file.
    pub sections: Vec<Section>,
    /// Case type from front matter (e.g. `corruption`, `fraud`).
    pub case_type: Option<String>,
    /// Case status from front matter (e.g. `open`, `trial`).
    pub status: Option<String>,
    /// Tags from front matter for categorization.
    pub tags: Vec<String>,
}
40
/// A raw H2 section: the kind derived from its heading plus its unparsed body.
#[derive(Debug)]
pub struct Section {
    /// Section kind mapped from the H2 heading text.
    pub kind: SectionKind,
    /// Lines following the heading up to the next H2 (or the end of the file),
    /// joined with `\n`. H3 and deeper headings are kept as part of the body.
    pub body: String,
    /// Line number (1-indexed) where the H2 heading appears in the original file.
    pub line: usize,
}
49
/// The type of an H2 section, mapped from heading text.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum SectionKind {
    /// `## People` -- recognised, but rejected in case files.
    People,
    /// `## Organizations` -- recognised, but rejected in case files.
    Organizations,
    /// `## Events`
    Events,
    /// `## Documents`
    Documents,
    /// `## Assets`
    Assets,
    /// `## Relationships`
    Relationships,
    /// `## Timeline`
    Timeline,
}

impl SectionKind {
    /// Map H2 heading text to a section kind.
    ///
    /// Surrounding whitespace is ignored and the comparison is ASCII
    /// case-insensitive. Returns `None` for headings that name no known kind.
    fn from_heading(heading: &str) -> Option<Self> {
        // Heading label -> kind, in the same order as the enum declaration.
        const LABELS: [(&str, SectionKind); 7] = [
            ("People", SectionKind::People),
            ("Organizations", SectionKind::Organizations),
            ("Events", SectionKind::Events),
            ("Documents", SectionKind::Documents),
            ("Assets", SectionKind::Assets),
            ("Relationships", SectionKind::Relationships),
            ("Timeline", SectionKind::Timeline),
        ];

        let wanted = heading.trim();
        LABELS
            .iter()
            .find(|entry| wanted.eq_ignore_ascii_case(entry.0))
            .map(|entry| entry.1)
    }

    /// Whether this section kind is valid in case files.
    /// People and Organizations are no longer allowed in case files.
    pub fn is_case_section(self) -> bool {
        match self {
            Self::People | Self::Organizations => false,
            Self::Events
            | Self::Documents
            | Self::Assets
            | Self::Relationships
            | Self::Timeline => true,
        }
    }
}
85
/// A parser error with file location.
///
/// Implements [`std::error::Error`] so it can be boxed, chained, or returned
/// through `?` by callers that use generic error handling.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct ParseError {
    /// 1-indexed line in the original file that the problem is attributed to.
    pub line: usize,
    /// Human-readable description of the problem.
    pub message: String,
}

impl fmt::Display for ParseError {
    /// Renders as `line <n>: <message>`.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        write!(f, "line {}: {}", self.line, self.message)
    }
}

impl std::error::Error for ParseError {}
98
/// Maximum number of tags per case file.
const MAX_CASE_TAGS: usize = 10;

/// Maximum number of tags per entity file.
const MAX_ENTITY_TAGS: usize = 5;

/// Maximum length of a single tag.
///
/// Shared by case and entity files. Compared against `str::len()`, i.e. the
/// unit is UTF-8 bytes even though the error messages say "chars".
const MAX_TAG_LEN: usize = 50;
107
/// YAML front matter schema for case files.
///
/// Deserialized by `extract_front_matter`; the semantic checks (ID format,
/// source URLs, known case type/status, tag limits) live in
/// `validate_front_matter`.
#[derive(Deserialize)]
struct FrontMatter {
    /// Case identifier. The only field without `#[serde(default)]`.
    id: String,
    /// Source references; an omitted key yields an empty list.
    #[serde(default)]
    sources: Vec<SourceEntry>,
    /// Optional case type, kept as the raw string from the file.
    #[serde(default)]
    case_type: Option<String>,
    /// Optional case status, kept as the raw string from the file.
    #[serde(default)]
    status: Option<String>,
    /// Categorization tags; an omitted key yields an empty list.
    #[serde(default)]
    tags: Vec<String>,
}
121
/// A source entry in front matter. Supports both bare URL strings and
/// structured objects with metadata.
///
/// The enum is `#[serde(untagged)]`, so the YAML shape (plain string vs.
/// mapping) selects the variant rather than an explicit tag.
#[derive(Debug, Clone, PartialEq, Eq, Deserialize, Serialize)]
#[serde(untagged)]
pub enum SourceEntry {
    /// Plain URL string (backward-compatible).
    Url(String),
    /// Structured source with metadata.
    Structured {
        /// The source URL (the only required key of the structured form).
        url: String,
        /// Optional title of the source.
        #[serde(default)]
        title: Option<String>,
        /// Optional publication date, kept as a raw string.
        /// NOTE(review): no format validation is visible in this file.
        #[serde(default)]
        published_at: Option<String>,
        /// Optional language of the source, kept as a raw string.
        #[serde(default)]
        language: Option<String>,
    },
}
140
141impl SourceEntry {
142    /// Get the URL from any source entry variant.
143    pub fn url(&self) -> &str {
144        match self {
145            Self::Url(u) => u,
146            Self::Structured { url, .. } => url,
147        }
148    }
149}
150
/// YAML front matter schema for standalone entity files.
///
/// Both fields are optional: `id` (NULID, generated on first build) and
/// `tags`.
#[derive(Deserialize)]
struct EntityFrontMatter {
    /// Stored identifier, absent until one has been generated.
    #[serde(default)]
    id: Option<String>,
    /// Categorization tags; an omitted key yields an empty list.
    #[serde(default)]
    tags: Vec<String>,
}
160
/// A parsed standalone entity file (actor or institution).
///
/// Produced by `parse_entity_file`; a value only exists when parsing
/// reported no errors.
#[derive(Debug)]
pub struct ParsedEntityFile {
    /// Stored NULID from front matter (None if not yet generated).
    pub id: Option<String>,
    /// Entity name from H1 heading.
    pub name: String,
    /// Raw bullet field lines (body after H1, no sections), joined with `\n`.
    pub body: String,
    /// Line number (1-indexed) of the H1 heading in the original file.
    pub title_line: usize,
    /// Tags from front matter.
    pub tags: Vec<String>,
}
175
176/// Parse a Markdown case file into a `ParsedCase`.
177///
178/// Extracts YAML front matter, H1 title, summary, and H2 sections.
179/// Returns errors for malformed structure or boundary violations.
180pub fn parse(input: &str) -> Result<ParsedCase, Vec<ParseError>> {
181    let mut errors = Vec::new();
182
183    // Extract front matter
184    let (front_matter, body_start_line, body) = extract_front_matter(input, &mut errors);
185
186    let Some(front_matter) = front_matter else {
187        if errors.is_empty() {
188            errors.push(ParseError {
189                line: 1,
190                message: "missing YAML front matter (expected `---` delimiter)".into(),
191            });
192        }
193        return Err(errors);
194    };
195
196    // Validate front matter fields
197    validate_front_matter(&front_matter, &mut errors);
198
199    // Extract title, summary, and sections from body
200    let (title, summary, sections) = extract_body(&body, body_start_line, &mut errors);
201
202    if !errors.is_empty() {
203        return Err(errors);
204    }
205
206    Ok(ParsedCase {
207        id: front_matter.id,
208        sources: front_matter.sources,
209        title,
210        summary,
211        sections,
212        case_type: front_matter.case_type,
213        status: front_matter.status,
214        tags: front_matter.tags,
215    })
216}
217
218/// Parse a standalone entity file (actor or institution).
219///
220/// Entity files have YAML front matter with optional `id:`, an H1 name,
221/// and bullet fields directly in the body. No H2 sections are allowed.
222pub fn parse_entity_file(input: &str) -> Result<ParsedEntityFile, Vec<ParseError>> {
223    let mut errors = Vec::new();
224
225    let (front_matter, body_start_line, body) = extract_entity_front_matter(input, &mut errors);
226
227    let id = front_matter.as_ref().and_then(|fm| fm.id.clone());
228    let tags = front_matter.map_or_else(Vec::new, |fm| fm.tags);
229
230    // Validate entity tags
231    if tags.len() > MAX_ENTITY_TAGS {
232        errors.push(ParseError {
233            line: 2,
234            message: format!(
235                "front matter `tags` exceeds {MAX_ENTITY_TAGS} entries (got {})",
236                tags.len()
237            ),
238        });
239    }
240    for (i, tag) in tags.iter().enumerate() {
241        if tag.len() > MAX_TAG_LEN {
242            errors.push(ParseError {
243                line: 2,
244                message: format!("front matter tag #{} exceeds {MAX_TAG_LEN} chars", i + 1),
245            });
246        }
247        if tag.is_empty() {
248            errors.push(ParseError {
249                line: 2,
250                message: format!("front matter tag #{} is empty", i + 1),
251            });
252        }
253    }
254
255    // Extract H1 title and body content (no sections allowed)
256    let (name, title_line, field_body) = extract_entity_body(&body, body_start_line, &mut errors);
257
258    if !errors.is_empty() {
259        return Err(errors);
260    }
261
262    Ok(ParsedEntityFile {
263        id,
264        name,
265        body: field_body,
266        title_line,
267        tags,
268    })
269}
270
271/// Extract YAML front matter for entity files.
272/// Front matter is optional for entity files -- if absent, returns None with no error.
273fn extract_entity_front_matter(
274    input: &str,
275    errors: &mut Vec<ParseError>,
276) -> (Option<EntityFrontMatter>, usize, String) {
277    let lines: Vec<&str> = input.lines().collect();
278
279    let first_delim = lines.iter().position(|l| l.trim() == "---");
280    if first_delim != Some(0) {
281        // No front matter -- entire file is body, starting at line 1
282        return (None, 1, input.to_string());
283    }
284
285    let close_delim = lines[1..].iter().position(|l| l.trim() == "---");
286    let Some(close_offset) = close_delim else {
287        errors.push(ParseError {
288            line: 1,
289            message: "unclosed YAML front matter (missing closing `---`)".into(),
290        });
291        return (None, 1, String::new());
292    };
293
294    let close_line = close_offset + 1;
295    let yaml_str: String = lines[1..close_line].join("\n");
296    let body_start_line = close_line + 2; // 1-indexed line number after closing `---`
297    let body = lines[close_line + 1..].join("\n");
298
299    match serde_yaml::from_str::<EntityFrontMatter>(&yaml_str) {
300        Ok(fm) => (Some(fm), body_start_line, body),
301        Err(e) => {
302            errors.push(ParseError {
303                line: 2,
304                message: format!("invalid YAML front matter: {e}"),
305            });
306            (None, body_start_line, body)
307        }
308    }
309}
310
311/// Extract H1 name and field body from an entity file.
312/// Rejects any H2 sections.
313fn extract_entity_body(
314    body: &str,
315    body_start_line: usize,
316    errors: &mut Vec<ParseError>,
317) -> (String, usize, String) {
318    let lines: Vec<&str> = body.lines().collect();
319    let mut name = String::new();
320    let mut title_found = false;
321    let mut title_line = body_start_line;
322    let mut field_lines: Vec<&str> = Vec::new();
323
324    for (i, line) in lines.iter().enumerate() {
325        let file_line = body_start_line + i;
326
327        if let Some(heading) = strip_heading(line, 1) {
328            if title_found {
329                errors.push(ParseError {
330                    line: file_line,
331                    message: "multiple H1 headings found (expected exactly one)".into(),
332                });
333                continue;
334            }
335            name = heading.to_string();
336            title_found = true;
337            title_line = file_line;
338            continue;
339        }
340
341        // Reject H2 sections in entity files
342        if strip_heading(line, 2).is_some() {
343            errors.push(ParseError {
344                line: file_line,
345                message: "H2 sections are not allowed in entity files".into(),
346            });
347            continue;
348        }
349
350        if title_found {
351            field_lines.push(line);
352        } else if !line.trim().is_empty() {
353            errors.push(ParseError {
354                line: file_line,
355                message: "expected H1 heading (# Name)".into(),
356            });
357        }
358    }
359
360    if !title_found {
361        errors.push(ParseError {
362            line: body_start_line,
363            message: "missing H1 heading".into(),
364        });
365    } else if name.len() > MAX_TITLE_LEN {
366        errors.push(ParseError {
367            line: title_line,
368            message: format!("H1 name exceeds {MAX_TITLE_LEN} chars (got {})", name.len()),
369        });
370    }
371
372    (name, title_line, field_lines.join("\n"))
373}
374
375/// Extract YAML front matter delimited by `---` lines.
376/// Returns the parsed front matter, the line number where the body starts,
377/// and the body text.
378fn extract_front_matter(
379    input: &str,
380    errors: &mut Vec<ParseError>,
381) -> (Option<FrontMatter>, usize, String) {
382    let lines: Vec<&str> = input.lines().collect();
383
384    // First non-empty line must be `---`
385    let first_delim = lines.iter().position(|l| l.trim() == "---");
386    if first_delim != Some(0) {
387        errors.push(ParseError {
388            line: 1,
389            message: "missing YAML front matter (expected `---` on first line)".into(),
390        });
391        return (None, 1, input.to_string());
392    }
393
394    // Find closing `---`
395    let close_delim = lines[1..].iter().position(|l| l.trim() == "---");
396    let Some(close_offset) = close_delim else {
397        errors.push(ParseError {
398            line: 1,
399            message: "unclosed YAML front matter (missing closing `---`)".into(),
400        });
401        return (None, 1, String::new());
402    };
403
404    let close_line = close_offset + 1; // index in `lines`
405    let yaml_str: String = lines[1..close_line].join("\n");
406    let body_start_line = close_line + 2; // 1-indexed line number after closing `---`
407    let body = lines[close_line + 1..].join("\n");
408
409    match serde_yaml::from_str::<FrontMatter>(&yaml_str) {
410        Ok(fm) => (Some(fm), body_start_line, body),
411        Err(e) => {
412            errors.push(ParseError {
413                line: 2,
414                message: format!("invalid YAML front matter: {e}"),
415            });
416            (None, body_start_line, body)
417        }
418    }
419}
420
421fn validate_front_matter(fm: &FrontMatter, errors: &mut Vec<ParseError>) {
422    // Validate case ID
423    if fm.id.is_empty() {
424        errors.push(ParseError {
425            line: 2,
426            message: "front matter `id` must not be empty".into(),
427        });
428    } else if fm.id.len() > MAX_CASE_ID_LEN {
429        errors.push(ParseError {
430            line: 2,
431            message: format!(
432                "front matter `id` exceeds {MAX_CASE_ID_LEN} chars (got {})",
433                fm.id.len()
434            ),
435        });
436    } else if !is_kebab_case(&fm.id) {
437        errors.push(ParseError {
438            line: 2,
439            message: format!(
440                "front matter `id` must be kebab-case [a-z0-9-], got {:?}",
441                fm.id
442            ),
443        });
444    }
445
446    // Validate sources count
447    if fm.sources.len() > MAX_SOURCES {
448        errors.push(ParseError {
449            line: 2,
450            message: format!(
451                "front matter `sources` exceeds {MAX_SOURCES} entries (got {})",
452                fm.sources.len()
453            ),
454        });
455    }
456
457    // Validate each source URL is HTTPS
458    for (i, source) in fm.sources.iter().enumerate() {
459        if !source.url().starts_with("https://") {
460            errors.push(ParseError {
461                line: 2,
462                message: format!("source[{i}] must be HTTPS, got {:?}", source.url()),
463            });
464        }
465    }
466
467    // Validate case_type
468    if let Some(ct) = &fm.case_type {
469        use crate::domain::CaseType;
470        let normalized = ct.to_lowercase().replace(' ', "_");
471        if !CaseType::KNOWN.contains(&normalized.as_str())
472            && crate::domain::parse_custom(ct).is_none()
473        {
474            errors.push(ParseError {
475                line: 2,
476                message: format!(
477                    "invalid case_type {:?} (known: {}; use \"custom:Value\" for custom)",
478                    ct,
479                    CaseType::KNOWN.join(", ")
480                ),
481            });
482        }
483    }
484
485    // Validate status
486    if let Some(st) = &fm.status {
487        use crate::domain::CaseStatus;
488        let normalized = st.to_lowercase().replace(' ', "_");
489        if !CaseStatus::KNOWN.contains(&normalized.as_str()) {
490            errors.push(ParseError {
491                line: 2,
492                message: format!(
493                    "invalid status {:?} (known: {})",
494                    st,
495                    CaseStatus::KNOWN.join(", ")
496                ),
497            });
498        }
499    }
500
501    // Validate tags
502    if fm.tags.len() > MAX_CASE_TAGS {
503        errors.push(ParseError {
504            line: 2,
505            message: format!(
506                "front matter `tags` exceeds {MAX_CASE_TAGS} entries (got {})",
507                fm.tags.len()
508            ),
509        });
510    }
511    for (i, tag) in fm.tags.iter().enumerate() {
512        if tag.len() > MAX_TAG_LEN {
513            errors.push(ParseError {
514                line: 2,
515                message: format!("tag[{i}] exceeds {MAX_TAG_LEN} chars (got {})", tag.len()),
516            });
517        }
518        if tag.is_empty() {
519            errors.push(ParseError {
520                line: 2,
521                message: format!("tag[{i}] must not be empty"),
522            });
523        }
524    }
525}
526
/// Check if a string is valid kebab-case: `[a-z0-9]+(-[a-z0-9]+)*`
///
/// Equivalently: splitting on `-` must yield only non-empty segments made of
/// ASCII lowercase letters and ASCII digits. The empty string is rejected.
fn is_kebab_case(s: &str) -> bool {
    s.split('-').all(|segment| {
        // An empty segment means a leading, trailing, or doubled dash
        // (or an entirely empty input).
        !segment.is_empty()
            && segment
                .bytes()
                .all(|b| b.is_ascii_lowercase() || b.is_ascii_digit())
    })
}
536
537/// Extract the H1 title, summary text, and H2 sections from the body.
538#[allow(clippy::too_many_lines)]
539fn extract_body(
540    body: &str,
541    body_start_line: usize,
542    errors: &mut Vec<ParseError>,
543) -> (String, String, Vec<Section>) {
544    let lines: Vec<&str> = body.lines().collect();
545    let mut title = String::new();
546    let mut title_found = false;
547    let mut summary_lines: Vec<&str> = Vec::new();
548    let mut sections: Vec<Section> = Vec::new();
549
550    // Track current H2 section being built
551    let mut current_section_kind: Option<SectionKind> = None;
552    let mut current_section_line: usize = 0;
553    let mut current_section_body: Vec<&str> = Vec::new();
554
555    // State: before H1, after H1 (summary), in sections
556    let mut state = State::BeforeTitle;
557
558    for (i, line) in lines.iter().enumerate() {
559        let file_line = body_start_line + i; // 1-indexed line in original file
560
561        if let Some(heading) = strip_heading(line, 1) {
562            if title_found {
563                errors.push(ParseError {
564                    line: file_line,
565                    message: "multiple H1 headings found (expected exactly one)".into(),
566                });
567                continue;
568            }
569            title = heading.to_string();
570            title_found = true;
571            state = State::Summary;
572            continue;
573        }
574
575        if let Some(heading) = strip_heading(line, 2) {
576            // Flush previous section
577            if let Some(kind) = current_section_kind.take() {
578                sections.push(Section {
579                    kind,
580                    body: current_section_body.join("\n"),
581                    line: current_section_line,
582                });
583                current_section_body.clear();
584            }
585
586            match SectionKind::from_heading(heading) {
587                Some(kind) if kind.is_case_section() => {
588                    // Check for duplicate sections
589                    if sections.iter().any(|s| s.kind == kind) {
590                        errors.push(ParseError {
591                            line: file_line,
592                            message: format!("duplicate section: ## {heading}"),
593                        });
594                    }
595                    current_section_kind = Some(kind);
596                    current_section_line = file_line;
597                    state = State::InSection;
598                }
599                Some(_) => {
600                    // Legacy section (People/Organizations) -- not allowed in case files
601                    errors.push(ParseError {
602                        line: file_line,
603                        message: format!(
604                            "## {heading} is not allowed in case files (use standalone entity files in people/ or organizations/ instead)"
605                        ),
606                    });
607                }
608                None => {
609                    errors.push(ParseError {
610                        line: file_line,
611                        message: format!(
612                            "unknown section: ## {heading} (expected one of: {})",
613                            KNOWN_CASE_SECTIONS.join(", ")
614                        ),
615                    });
616                }
617            }
618            continue;
619        }
620
621        match state {
622            State::BeforeTitle => {
623                // Skip blank lines before title
624                if !line.trim().is_empty() {
625                    errors.push(ParseError {
626                        line: file_line,
627                        message: "expected H1 title (# Title)".into(),
628                    });
629                }
630            }
631            State::Summary => {
632                summary_lines.push(line);
633            }
634            State::InSection => {
635                current_section_body.push(line);
636            }
637        }
638    }
639
640    // Flush last section
641    if let Some(kind) = current_section_kind.take() {
642        sections.push(Section {
643            kind,
644            body: current_section_body.join("\n"),
645            line: current_section_line,
646        });
647    }
648
649    // Validate title
650    if !title_found {
651        errors.push(ParseError {
652            line: body_start_line,
653            message: "missing H1 title".into(),
654        });
655    } else if title.len() > MAX_TITLE_LEN {
656        errors.push(ParseError {
657            line: body_start_line,
658            message: format!(
659                "H1 title exceeds {MAX_TITLE_LEN} chars (got {})",
660                title.len()
661            ),
662        });
663    }
664
665    // Build summary (trim leading/trailing blank lines)
666    let summary = summary_lines.clone().join("\n").trim().to_string();
667
668    if summary.len() > MAX_SUMMARY_LEN {
669        errors.push(ParseError {
670            line: body_start_line,
671            message: format!(
672                "summary exceeds {MAX_SUMMARY_LEN} chars (got {})",
673                summary.len()
674            ),
675        });
676    }
677
678    (title, summary, sections)
679}
680
/// Position of the line scanner inside `extract_body`; it decides where a
/// non-heading line is routed.
#[derive(Clone, Copy)]
enum State {
    /// No H1 seen yet; non-blank lines are reported as errors.
    BeforeTitle,
    /// After the H1; lines are collected as summary text.
    Summary,
    /// Lines are collected into the H2 section body buffer.
    InSection,
}
687
/// Strip an ATX heading prefix of the given level. Returns the heading text.
/// E.g., `strip_heading("## Foo", 2)` returns `Some("Foo")`.
///
/// Rules:
/// - leading whitespace before the `#` run is ignored;
/// - the line must start with *exactly* `level` `#` characters, so `### Foo`
///   matches neither level 1 nor level 2;
/// - the `#` run must be followed by a space or by the end of the line
///   (`#Foo` is not a heading; a bare `#` yields `Some("")`);
/// - the returned text is trimmed and may itself start with `#`
///   (`# #1 Suspect` is an H1 with text `#1 Suspect`).
fn strip_heading(line: &str, level: usize) -> Option<&str> {
    let trimmed = line.trim_start();

    // Counting the whole `#` run rejects deeper headings without needing a
    // separate look-ahead after the prefix.
    let hashes = trimmed.bytes().take_while(|&b| b == b'#').count();
    if hashes != level {
        return None;
    }

    // The first `level` bytes are ASCII `#`, so this is a valid char boundary.
    let after = &trimmed[level..];
    if after.is_empty() {
        return Some("");
    }
    after.strip_prefix(' ').map(str::trim)
}
710
711#[cfg(test)]
712mod tests {
713    use super::*;
714
715    fn minimal_case() -> String {
716        [
717            "---",
718            "id: test-case",
719            "sources:",
720            "  - https://example.com/source",
721            "---",
722            "",
723            "# Test Case Title",
724            "",
725            "This is the summary.",
726            "",
727            "## Events",
728            "",
729            "### Something happened",
730            "- occurred_at: 2025-01-01",
731            "",
732            "## Relationships",
733            "",
734            "- Something happened -> Something happened: associate_of",
735        ]
736        .join("\n")
737    }
738
739    #[test]
740    fn parse_minimal_case() {
741        let result = parse(&minimal_case());
742        let case = result.unwrap_or_else(|errs| {
743            panic!(
744                "parse failed: {}",
745                errs.iter()
746                    .map(ToString::to_string)
747                    .collect::<Vec<_>>()
748                    .join("; ")
749            );
750        });
751
752        assert_eq!(case.id, "test-case");
753        assert_eq!(case.sources.len(), 1);
754        assert_eq!(case.sources[0].url(), "https://example.com/source");
755        assert_eq!(case.title, "Test Case Title");
756        assert_eq!(case.summary, "This is the summary.");
757        assert_eq!(case.sections.len(), 2);
758        assert_eq!(case.sections[0].kind, SectionKind::Events);
759        assert_eq!(case.sections[1].kind, SectionKind::Relationships);
760    }
761
762    #[test]
763    fn parse_missing_front_matter() {
764        let input = "# Title\n\nSummary.\n";
765        let errs = parse(input).unwrap_err();
766        assert!(errs.iter().any(|e| e.message.contains("front matter")));
767    }
768
769    #[test]
770    fn parse_unclosed_front_matter() {
771        let input = "---\nid: test\n# Title\n";
772        let errs = parse(input).unwrap_err();
773        assert!(errs.iter().any(|e| e.message.contains("unclosed")));
774    }
775
776    #[test]
777    fn parse_invalid_case_id_uppercase() {
778        let input = "---\nid: Test-Case\nsources: []\n---\n\n# Title\n";
779        let errs = parse(input).unwrap_err();
780        assert!(errs.iter().any(|e| e.message.contains("kebab-case")));
781    }
782
783    #[test]
784    fn parse_case_id_too_long() {
785        let long_id = "a".repeat(61);
786        let input = format!("---\nid: {long_id}\nsources: []\n---\n\n# Title\n");
787        let errs = parse(&input).unwrap_err();
788        assert!(errs.iter().any(|e| e.message.contains("exceeds 60")));
789    }
790
791    #[test]
792    fn parse_non_https_source() {
793        let input = "---\nid: test\nsources:\n  - http://example.com\n---\n\n# Title\n";
794        let errs = parse(input).unwrap_err();
795        assert!(errs.iter().any(|e| e.message.contains("HTTPS")));
796    }
797
798    #[test]
799    fn parse_too_many_sources() {
800        let sources: Vec<String> = (0..21)
801            .map(|i| format!("  - https://example.com/{i}"))
802            .collect();
803        let input = format!(
804            "---\nid: test\nsources:\n{}\n---\n\n# Title\n",
805            sources.join("\n")
806        );
807        let errs = parse(&input).unwrap_err();
808        assert!(errs.iter().any(|e| e.message.contains("exceeds 20")));
809    }
810
811    #[test]
812    fn parse_unknown_section() {
813        let input = [
814            "---",
815            "id: test",
816            "sources: []",
817            "---",
818            "",
819            "# Title",
820            "",
821            "## Unknown Section",
822            "",
823        ]
824        .join("\n");
825        let errs = parse(&input).unwrap_err();
826        assert!(errs.iter().any(|e| e.message.contains("unknown section")));
827    }
828
829    #[test]
830    fn parse_duplicate_section() {
831        let input = [
832            "---",
833            "id: test",
834            "sources: []",
835            "---",
836            "",
837            "# Title",
838            "",
839            "## Events",
840            "",
841            "## Events",
842            "",
843        ]
844        .join("\n");
845        let errs = parse(&input).unwrap_err();
846        assert!(errs.iter().any(|e| e.message.contains("duplicate")));
847    }
848
849    #[test]
850    fn parse_multiple_h1() {
851        let input = [
852            "---",
853            "id: test",
854            "sources: []",
855            "---",
856            "",
857            "# First Title",
858            "",
859            "# Second Title",
860            "",
861        ]
862        .join("\n");
863        let errs = parse(&input).unwrap_err();
864        assert!(errs.iter().any(|e| e.message.contains("multiple H1")));
865    }
866
867    #[test]
868    fn parse_all_sections() {
869        let input = [
870            "---",
871            "id: full-case",
872            "sources:",
873            "  - https://example.com/a",
874            "---",
875            "",
876            "# Full Case",
877            "",
878            "Summary text here.",
879            "",
880            "## Events",
881            "",
882            "### Something happened",
883            "- occurred_at: 2025-01-01",
884            "",
885            "## Relationships",
886            "",
887            "- Alice -> Corp Inc: employed_by",
888            "",
889            "## Timeline",
890            "",
891            "Something happened",
892        ]
893        .join("\n");
894
895        let case = parse(&input).unwrap_or_else(|errs| {
896            panic!(
897                "parse failed: {}",
898                errs.iter()
899                    .map(ToString::to_string)
900                    .collect::<Vec<_>>()
901                    .join("; ")
902            );
903        });
904
905        assert_eq!(case.id, "full-case");
906        assert_eq!(case.title, "Full Case");
907        assert_eq!(case.summary, "Summary text here.");
908        assert_eq!(case.sections.len(), 3);
909        assert_eq!(case.sections[0].kind, SectionKind::Events);
910        assert_eq!(case.sections[1].kind, SectionKind::Relationships);
911        assert_eq!(case.sections[2].kind, SectionKind::Timeline);
912    }
913
914    #[test]
915    fn parse_empty_summary() {
916        let input = [
917            "---",
918            "id: test",
919            "sources: []",
920            "---",
921            "",
922            "# Title",
923            "",
924            "## Events",
925            "",
926        ]
927        .join("\n");
928
929        let case = parse(&input).unwrap_or_else(|errs| {
930            panic!(
931                "parse failed: {}",
932                errs.iter()
933                    .map(ToString::to_string)
934                    .collect::<Vec<_>>()
935                    .join("; ")
936            );
937        });
938        assert_eq!(case.summary, "");
939    }
940
941    #[test]
942    fn parse_multiline_summary() {
943        let input = [
944            "---",
945            "id: test",
946            "sources: []",
947            "---",
948            "",
949            "# Title",
950            "",
951            "First line of summary.",
952            "Second line of summary.",
953            "",
954            "## Events",
955            "",
956        ]
957        .join("\n");
958
959        let case = parse(&input).unwrap_or_else(|errs| {
960            panic!(
961                "parse failed: {}",
962                errs.iter()
963                    .map(ToString::to_string)
964                    .collect::<Vec<_>>()
965                    .join("; ")
966            );
967        });
968        assert_eq!(
969            case.summary,
970            "First line of summary.\nSecond line of summary."
971        );
972    }
973
974    #[test]
975    fn strip_heading_levels() {
976        assert_eq!(strip_heading("# Title", 1), Some("Title"));
977        assert_eq!(strip_heading("## Section", 2), Some("Section"));
978        assert_eq!(strip_heading("### Entity", 3), Some("Entity"));
979        // H3 should not match H2
980        assert_eq!(strip_heading("### Entity", 2), None);
981        // H2 should not match H1
982        assert_eq!(strip_heading("## Section", 1), None);
983        // Not a heading
984        assert_eq!(strip_heading("Normal text", 1), None);
985    }
986
987    #[test]
988    fn kebab_case_validation() {
989        assert!(is_kebab_case("valid-case-id"));
990        assert!(is_kebab_case("a"));
991        assert!(is_kebab_case("test-123"));
992        assert!(!is_kebab_case(""));
993        assert!(!is_kebab_case("-leading"));
994        assert!(!is_kebab_case("trailing-"));
995        assert!(!is_kebab_case("double--dash"));
996        assert!(!is_kebab_case("Upper"));
997        assert!(!is_kebab_case("has space"));
998    }
999
1000    #[test]
1001    fn section_body_content() {
1002        let input = [
1003            "---",
1004            "id: test",
1005            "sources: []",
1006            "---",
1007            "",
1008            "# Title",
1009            "",
1010            "## Events",
1011            "",
1012            "### Bonnick dismissal",
1013            "- occurred_at: 2024-12-24",
1014            "- type: termination",
1015            "",
1016        ]
1017        .join("\n");
1018
1019        let case = parse(&input).unwrap_or_else(|errs| {
1020            panic!(
1021                "parse failed: {}",
1022                errs.iter()
1023                    .map(ToString::to_string)
1024                    .collect::<Vec<_>>()
1025                    .join("; ")
1026            );
1027        });
1028
1029        assert_eq!(case.sections.len(), 1);
1030        let body = &case.sections[0].body;
1031        assert!(body.contains("### Bonnick dismissal"));
1032        assert!(body.contains("- occurred_at: 2024-12-24"));
1033    }
1034
1035    #[test]
1036    fn parse_rejects_people_section_in_case_file() {
1037        let input = [
1038            "---",
1039            "id: test",
1040            "sources: []",
1041            "---",
1042            "",
1043            "# Title",
1044            "",
1045            "## People",
1046            "",
1047        ]
1048        .join("\n");
1049        let errs = parse(&input).unwrap_err();
1050        assert!(
1051            errs.iter()
1052                .any(|e| e.message.contains("not allowed in case files"))
1053        );
1054    }
1055
1056    #[test]
1057    fn parse_rejects_organizations_section_in_case_file() {
1058        let input = [
1059            "---",
1060            "id: test",
1061            "sources: []",
1062            "---",
1063            "",
1064            "# Title",
1065            "",
1066            "## Organizations",
1067            "",
1068        ]
1069        .join("\n");
1070        let errs = parse(&input).unwrap_err();
1071        assert!(
1072            errs.iter()
1073                .any(|e| e.message.contains("not allowed in case files"))
1074        );
1075    }
1076
1077    #[test]
1078    fn parse_entity_file_with_id() {
1079        let input = [
1080            "---",
1081            "id: 01JXYZ123456789ABCDEFGHIJK",
1082            "---",
1083            "",
1084            "# Mark Bonnick",
1085            "",
1086            "- qualifier: Arsenal Kit Manager",
1087            "- nationality: British",
1088            "",
1089        ]
1090        .join("\n");
1091
1092        let result = parse_entity_file(&input).unwrap();
1093        assert_eq!(result.id.as_deref(), Some("01JXYZ123456789ABCDEFGHIJK"));
1094        assert_eq!(result.name, "Mark Bonnick");
1095        assert!(result.body.contains("- qualifier: Arsenal Kit Manager"));
1096        assert!(result.body.contains("- nationality: British"));
1097    }
1098
1099    #[test]
1100    fn parse_entity_file_without_id() {
1101        let input = [
1102            "---",
1103            "---",
1104            "",
1105            "# Arsenal FC",
1106            "",
1107            "- qualifier: English Football Club",
1108            "- org_type: sports_club",
1109            "",
1110        ]
1111        .join("\n");
1112
1113        let result = parse_entity_file(&input).unwrap();
1114        assert!(result.id.is_none());
1115        assert_eq!(result.name, "Arsenal FC");
1116    }
1117
1118    #[test]
1119    fn parse_entity_file_no_front_matter() {
1120        let input = ["# Bob Smith", "", "- nationality: Dutch", ""].join("\n");
1121
1122        let result = parse_entity_file(&input).unwrap();
1123        assert!(result.id.is_none());
1124        assert_eq!(result.name, "Bob Smith");
1125        assert!(result.body.contains("- nationality: Dutch"));
1126    }
1127
1128    #[test]
1129    fn parse_entity_file_rejects_h2_sections() {
1130        let input = [
1131            "---",
1132            "---",
1133            "",
1134            "# Test Entity",
1135            "",
1136            "## Relationships",
1137            "",
1138        ]
1139        .join("\n");
1140
1141        let errs = parse_entity_file(&input).unwrap_err();
1142        assert!(errs.iter().any(|e| e.message.contains("H2 sections")));
1143    }
1144
1145    #[test]
1146    fn parse_entity_file_missing_h1() {
1147        let input = ["---", "---", "", "- nationality: Dutch", ""].join("\n");
1148
1149        let errs = parse_entity_file(&input).unwrap_err();
1150        assert!(errs.iter().any(|e| e.message.contains("missing H1")));
1151    }
1152}
weave_content/parser.rs

weave_content/
parser.rs