weave_content/
parser.rs

1#![allow(clippy::module_name_repetitions)]
2
3use std::fmt;
4
5use serde::{Deserialize, Serialize};
6
/// Length of a case NULID (26 chars Crockford Base32).
///
/// Despite the `MAX_` prefix, `validate_front_matter` rejects any `id`
/// whose length differs from this value, so it acts as an exact length.
const MAX_CASE_ID_LEN: usize = 26;

/// Maximum number of sources in front matter.
const MAX_SOURCES: usize = 20;

/// Maximum length of the case title (H1).
///
/// Compared against the byte length (`str::len`) of the heading text.
/// The same limit is applied to the H1 name of standalone entity files.
const MAX_TITLE_LEN: usize = 200;

/// Maximum length of the case summary.
///
/// Compared against the byte length (`str::len`) of the trimmed summary.
const MAX_SUMMARY_LEN: usize = 2000;
18
/// Known H2 section names for case files (case-insensitive match).
/// People and Organizations are no longer allowed in case files -- they
/// live in standalone entity files under `people/` and `organizations/`.
///
/// In the code visible here this list is only used to build the
/// "unknown section" error message; the matching itself is done by
/// `SectionKind::from_heading` / `SectionKind::is_case_section`, so the
/// two must be kept in sync by hand.
const KNOWN_CASE_SECTIONS: &[&str] = &[
    "Events",
    "Documents",
    "Assets",
    "Relationships",
    "Timeline",
    "Related Cases",
];
30
/// A parsed case file with front matter, title, summary, and raw sections.
///
/// Produced by `parse`, which only returns a value when no errors were
/// collected.
#[derive(Debug)]
pub struct ParsedCase {
    /// NULID for the case node (None if not yet generated).
    pub id: Option<String>,
    /// Source entries from the front matter `sources` list.
    pub sources: Vec<SourceEntry>,
    /// Text of the H1 heading.
    pub title: String,
    /// Lines collected after the H1 heading and before the first accepted
    /// H2 section, joined with `\n` and trimmed of surrounding whitespace.
    pub summary: String,
    /// Raw H2 sections in file order. `## Related Cases` sections are not
    /// kept here; `parse` consumes them into `related_cases`.
    pub sections: Vec<Section>,
    /// Case type from front matter (e.g. `corruption`, `fraud`).
    pub case_type: Option<String>,
    /// Case status from front matter (e.g. `open`, `trial`).
    pub status: Option<String>,
    /// Structured amounts DSL string (e.g. `660000 USD bribe | 250000000 IDR fine`).
    pub amounts: Option<String>,
    /// Tags from front matter for categorization.
    pub tags: Vec<String>,
    /// Related case entries from `## Related Cases` section.
    pub related_cases: Vec<RelatedCase>,
}
51
/// A related case entry from `## Related Cases` section.
///
/// Entries returned by `parse_related_cases` always have a non-empty
/// `case_path` and a non-empty `description`; entries failing those checks
/// are reported as errors and dropped.
#[derive(Debug, Clone, PartialEq, Eq, Serialize)]
pub struct RelatedCase {
    /// Case path relative to content root (e.g. `id/corruption/2002/blbi-liquidity-aid-scandal`).
    pub case_path: String,
    /// Description of the relationship between the cases.
    pub description: String,
}
60
/// A raw H2 section with its heading text and body content.
///
/// The body is kept unparsed; interpreting it is left to the consumer of
/// the section.
#[derive(Debug)]
pub struct Section {
    /// Which known section this is, as mapped from the H2 heading text.
    pub kind: SectionKind,
    /// Non-heading lines that follow the H2 heading, joined with `\n`.
    pub body: String,
    /// Line number (1-indexed) where the H2 heading appears in the original file.
    pub line: usize,
}
69
/// The kind of an H2 section, derived from its heading text.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum SectionKind {
    People,
    Organizations,
    Events,
    Documents,
    Assets,
    Relationships,
    Timeline,
    RelatedCases,
}

impl SectionKind {
    /// Map H2 heading text to a section kind.
    ///
    /// Surrounding whitespace is ignored and the comparison is ASCII
    /// case-insensitive. Returns `None` when the heading matches no variant.
    fn from_heading(heading: &str) -> Option<Self> {
        // Heading label for every variant, legacy ones included.
        const HEADINGS: [(&str, SectionKind); 8] = [
            ("People", SectionKind::People),
            ("Organizations", SectionKind::Organizations),
            ("Events", SectionKind::Events),
            ("Documents", SectionKind::Documents),
            ("Assets", SectionKind::Assets),
            ("Relationships", SectionKind::Relationships),
            ("Timeline", SectionKind::Timeline),
            ("Related Cases", SectionKind::RelatedCases),
        ];

        let wanted = heading.trim();
        HEADINGS
            .iter()
            .find(|(label, _)| wanted.eq_ignore_ascii_case(label))
            .map(|&(_, kind)| kind)
    }

    /// Whether this section kind may appear in a case file.
    ///
    /// `People` and `Organizations` are legacy kinds that are no longer
    /// accepted in case files; every other kind is.
    pub fn is_case_section(self) -> bool {
        match self {
            Self::People | Self::Organizations => false,
            Self::Events
            | Self::Documents
            | Self::Assets
            | Self::Relationships
            | Self::Timeline
            | Self::RelatedCases => true,
        }
    }
}
112
/// A parser error tied to a location in the input file.
#[derive(Debug)]
pub struct ParseError {
    /// Line number the error refers to.
    pub line: usize,
    /// Human-readable description of the problem.
    pub message: String,
}

impl fmt::Display for ParseError {
    /// Render the error as `line <N>: <message>`.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let Self { line, message } = self;
        write!(f, "line {line}: {message}")
    }
}
125
/// Maximum number of tags per case file.
const MAX_CASE_TAGS: usize = 10;

/// Maximum number of tags per entity file.
const MAX_ENTITY_TAGS: usize = 5;

/// Maximum length of a single tag.
///
/// Compared against the tag's byte length (`str::len`); applies to both
/// case and entity tags.
const MAX_TAG_LEN: usize = 50;

/// Maximum number of related case entries per case file.
const MAX_RELATED_CASES: usize = 10;

/// Maximum length of a related case description.
///
/// Compared against the description's byte length (`str::len`).
const MAX_RELATED_DESCRIPTION_LEN: usize = 500;
140
141/// Parse the body of a `## Related Cases` section into `RelatedCase` entries.
142///
143/// Each entry is a bullet `- <case_path>` followed by an indented
144/// `description: <text>` line. Validates limits and non-empty fields.
145pub fn parse_related_cases(
146    body: &str,
147    section_start_line: usize,
148    errors: &mut Vec<ParseError>,
149) -> Vec<RelatedCase> {
150    let mut entries: Vec<(String, String, usize)> = Vec::new(); // (path, desc, line)
151
152    for (offset, line) in body.lines().enumerate() {
153        let file_line = section_start_line + offset + 1;
154
155        if let Some(rest) = line.strip_prefix("- ") {
156            let case_path = rest.trim().to_string();
157            entries.push((case_path, String::new(), file_line));
158        } else if let Some(rest) = line.strip_prefix("  description: ") {
159            if let Some(entry) = entries.last_mut() {
160                entry.1 = rest.trim().to_string();
161            } else {
162                errors.push(ParseError {
163                    line: file_line,
164                    message: "description without a preceding case path".into(),
165                });
166            }
167        } else if !line.trim().is_empty() {
168            errors.push(ParseError {
169                line: file_line,
170                message: format!("unexpected line in Related Cases: {line}"),
171            });
172        }
173    }
174
175    if entries.len() > MAX_RELATED_CASES {
176        errors.push(ParseError {
177            line: section_start_line,
178            message: format!(
179                "Related Cases exceeds {MAX_RELATED_CASES} entries (got {})",
180                entries.len()
181            ),
182        });
183    }
184
185    let mut result = Vec::new();
186    for (case_path, description, line) in entries {
187        if case_path.is_empty() {
188            errors.push(ParseError {
189                line,
190                message: "related case path must not be empty".into(),
191            });
192            continue;
193        }
194        if description.is_empty() {
195            errors.push(ParseError {
196                line,
197                message: format!("related case {case_path:?} missing description"),
198            });
199            continue;
200        }
201        if description.len() > MAX_RELATED_DESCRIPTION_LEN {
202            errors.push(ParseError {
203                line,
204                message: format!(
205                    "related case description exceeds {MAX_RELATED_DESCRIPTION_LEN} chars (got {})",
206                    description.len()
207                ),
208            });
209            continue;
210        }
211        result.push(RelatedCase {
212            case_path,
213            description,
214        });
215    }
216
217    result
218}
219
/// YAML front matter schema for case files.
///
/// Every field is marked `#[serde(default)]`, so a key that is absent from
/// the YAML deserializes to `None` or an empty list rather than failing.
/// Field contents are checked afterwards by `validate_front_matter`.
#[derive(Deserialize)]
struct FrontMatter {
    /// NULID for the case node (auto-generated on first build).
    #[serde(default)]
    id: Option<String>,
    /// Source entries, either bare URL strings or structured objects.
    #[serde(default)]
    sources: Vec<SourceEntry>,
    /// Case type string as written in the file.
    #[serde(default)]
    case_type: Option<String>,
    /// Case status string as written in the file.
    #[serde(default)]
    status: Option<String>,
    /// Amounts DSL string; passed through without validation here.
    #[serde(default)]
    amounts: Option<String>,
    /// Free-form tags for categorization.
    #[serde(default)]
    tags: Vec<String>,
}
237
/// A source entry in front matter. Supports both bare URL strings and
/// structured objects with metadata.
///
/// The enum is `#[serde(untagged)]`, so the variant is selected by the
/// shape of the value: a plain string becomes `Url`, a mapping with a
/// `url` key becomes `Structured`.
#[derive(Debug, Clone, PartialEq, Eq, Deserialize, Serialize)]
#[serde(untagged)]
pub enum SourceEntry {
    /// Plain URL string (backward-compatible).
    Url(String),
    /// Structured source with metadata.
    Structured {
        /// The source URL (required).
        url: String,
        /// Optional title of the source; `None` when the key is absent.
        #[serde(default)]
        title: Option<String>,
        /// Optional publication date, kept as the raw string from the file.
        #[serde(default)]
        published_at: Option<String>,
        /// Optional language of the source, kept as the raw string from the file.
        #[serde(default)]
        language: Option<String>,
    },
}
256
257impl SourceEntry {
258    /// Get the URL from any source entry variant.
259    pub fn url(&self) -> &str {
260        match self {
261            Self::Url(u) => u,
262            Self::Structured { url, .. } => url,
263        }
264    }
265}
266
/// YAML front matter schema for standalone entity files.
/// Contains an optional `id` field (NULID, generated on first build) and
/// an optional list of `tags`; both default when the key is absent.
#[derive(Deserialize)]
struct EntityFrontMatter {
    /// Stored NULID, if one has been written to the file.
    #[serde(default)]
    id: Option<String>,
    /// Tags for the entity; limits are enforced by `parse_entity_file`.
    #[serde(default)]
    tags: Vec<String>,
}
276
/// A parsed standalone entity file (actor or institution).
///
/// Produced by `parse_entity_file`, which only returns a value when no
/// errors were collected.
#[derive(Debug)]
pub struct ParsedEntityFile {
    /// Stored NULID from front matter (None if not yet generated).
    pub id: Option<String>,
    /// Entity name from H1 heading.
    pub name: String,
    /// Raw bullet field lines (body after H1, no sections), joined with `\n`.
    pub body: String,
    /// Line number of the H1 heading in the original file.
    pub title_line: usize,
    /// Tags from front matter.
    pub tags: Vec<String>,
}
291
292/// Parse a Markdown case file into a `ParsedCase`.
293///
294/// Extracts YAML front matter, H1 title, summary, and H2 sections.
295/// Returns errors for malformed structure or boundary violations.
296pub fn parse(input: &str) -> Result<ParsedCase, Vec<ParseError>> {
297    let mut errors = Vec::new();
298
299    // Extract front matter
300    let (front_matter, body_start_line, body) = extract_front_matter(input, &mut errors);
301
302    let Some(front_matter) = front_matter else {
303        if errors.is_empty() {
304            errors.push(ParseError {
305                line: 1,
306                message: "missing YAML front matter (expected `---` delimiter)".into(),
307            });
308        }
309        return Err(errors);
310    };
311
312    // Validate front matter fields
313    validate_front_matter(&front_matter, &mut errors);
314
315    // Extract title, summary, and sections from body
316    let (title, summary, mut sections) = extract_body(&body, body_start_line, &mut errors);
317
318    // Parse Related Cases sections
319    let mut related_cases = Vec::new();
320    for section in &sections {
321        if section.kind == SectionKind::RelatedCases {
322            let entries = parse_related_cases(&section.body, section.line, &mut errors);
323            related_cases.extend(entries);
324        }
325    }
326    // Remove RelatedCases from sections list (consumed)
327    sections.retain(|s| s.kind != SectionKind::RelatedCases);
328
329    if !errors.is_empty() {
330        return Err(errors);
331    }
332
333    Ok(ParsedCase {
334        id: front_matter.id,
335        sources: front_matter.sources,
336        title,
337        summary,
338        sections,
339        case_type: front_matter.case_type,
340        status: front_matter.status,
341        amounts: front_matter.amounts,
342        tags: front_matter.tags,
343        related_cases,
344    })
345}
346
347/// Parse a standalone entity file (actor or institution).
348///
349/// Entity files have YAML front matter with optional `id:`, an H1 name,
350/// and bullet fields directly in the body. No H2 sections are allowed.
351pub fn parse_entity_file(input: &str) -> Result<ParsedEntityFile, Vec<ParseError>> {
352    let mut errors = Vec::new();
353
354    let (front_matter, body_start_line, body) = extract_entity_front_matter(input, &mut errors);
355
356    let id = front_matter.as_ref().and_then(|fm| fm.id.clone());
357    let tags = front_matter.map_or_else(Vec::new, |fm| fm.tags);
358
359    // Validate entity tags
360    if tags.len() > MAX_ENTITY_TAGS {
361        errors.push(ParseError {
362            line: 2,
363            message: format!(
364                "front matter `tags` exceeds {MAX_ENTITY_TAGS} entries (got {})",
365                tags.len()
366            ),
367        });
368    }
369    for (i, tag) in tags.iter().enumerate() {
370        if tag.len() > MAX_TAG_LEN {
371            errors.push(ParseError {
372                line: 2,
373                message: format!("front matter tag #{} exceeds {MAX_TAG_LEN} chars", i + 1),
374            });
375        }
376        if tag.is_empty() {
377            errors.push(ParseError {
378                line: 2,
379                message: format!("front matter tag #{} is empty", i + 1),
380            });
381        }
382    }
383
384    // Extract H1 title and body content (no sections allowed)
385    let (name, title_line, field_body) = extract_entity_body(&body, body_start_line, &mut errors);
386
387    if !errors.is_empty() {
388        return Err(errors);
389    }
390
391    Ok(ParsedEntityFile {
392        id,
393        name,
394        body: field_body,
395        title_line,
396        tags,
397    })
398}
399
400/// Extract YAML front matter for entity files.
401/// Front matter is optional for entity files -- if absent, returns None with no error.
402fn extract_entity_front_matter(
403    input: &str,
404    errors: &mut Vec<ParseError>,
405) -> (Option<EntityFrontMatter>, usize, String) {
406    let lines: Vec<&str> = input.lines().collect();
407
408    let first_delim = lines.iter().position(|l| l.trim() == "---");
409    if first_delim != Some(0) {
410        // No front matter -- entire file is body, starting at line 1
411        return (None, 1, input.to_string());
412    }
413
414    let close_delim = lines[1..].iter().position(|l| l.trim() == "---");
415    let Some(close_offset) = close_delim else {
416        errors.push(ParseError {
417            line: 1,
418            message: "unclosed YAML front matter (missing closing `---`)".into(),
419        });
420        return (None, 1, String::new());
421    };
422
423    let close_line = close_offset + 1;
424    let yaml_str: String = lines[1..close_line].join("\n");
425    let body_start_line = close_line + 2; // 1-indexed line number after closing `---`
426    let body = lines[close_line + 1..].join("\n");
427
428    match serde_yaml::from_str::<EntityFrontMatter>(&yaml_str) {
429        Ok(fm) => (Some(fm), body_start_line, body),
430        Err(e) => {
431            errors.push(ParseError {
432                line: 2,
433                message: format!("invalid YAML front matter: {e}"),
434            });
435            (None, body_start_line, body)
436        }
437    }
438}
439
440/// Extract H1 name and field body from an entity file.
441/// Rejects any H2 sections.
442fn extract_entity_body(
443    body: &str,
444    body_start_line: usize,
445    errors: &mut Vec<ParseError>,
446) -> (String, usize, String) {
447    let lines: Vec<&str> = body.lines().collect();
448    let mut name = String::new();
449    let mut title_found = false;
450    let mut title_line = body_start_line;
451    let mut field_lines: Vec<&str> = Vec::new();
452
453    for (i, line) in lines.iter().enumerate() {
454        let file_line = body_start_line + i;
455
456        if let Some(heading) = strip_heading(line, 1) {
457            if title_found {
458                errors.push(ParseError {
459                    line: file_line,
460                    message: "multiple H1 headings found (expected exactly one)".into(),
461                });
462                continue;
463            }
464            name = heading.to_string();
465            title_found = true;
466            title_line = file_line;
467            continue;
468        }
469
470        // Reject H2 sections in entity files
471        if strip_heading(line, 2).is_some() {
472            errors.push(ParseError {
473                line: file_line,
474                message: "H2 sections are not allowed in entity files".into(),
475            });
476            continue;
477        }
478
479        if title_found {
480            field_lines.push(line);
481        } else if !line.trim().is_empty() {
482            errors.push(ParseError {
483                line: file_line,
484                message: "expected H1 heading (# Name)".into(),
485            });
486        }
487    }
488
489    if !title_found {
490        errors.push(ParseError {
491            line: body_start_line,
492            message: "missing H1 heading".into(),
493        });
494    } else if name.len() > MAX_TITLE_LEN {
495        errors.push(ParseError {
496            line: title_line,
497            message: format!("H1 name exceeds {MAX_TITLE_LEN} chars (got {})", name.len()),
498        });
499    }
500
501    (name, title_line, field_lines.join("\n"))
502}
503
504/// Extract YAML front matter delimited by `---` lines.
505/// Returns the parsed front matter, the line number where the body starts,
506/// and the body text.
507fn extract_front_matter(
508    input: &str,
509    errors: &mut Vec<ParseError>,
510) -> (Option<FrontMatter>, usize, String) {
511    let lines: Vec<&str> = input.lines().collect();
512
513    // First non-empty line must be `---`
514    let first_delim = lines.iter().position(|l| l.trim() == "---");
515    if first_delim != Some(0) {
516        errors.push(ParseError {
517            line: 1,
518            message: "missing YAML front matter (expected `---` on first line)".into(),
519        });
520        return (None, 1, input.to_string());
521    }
522
523    // Find closing `---`
524    let close_delim = lines[1..].iter().position(|l| l.trim() == "---");
525    let Some(close_offset) = close_delim else {
526        errors.push(ParseError {
527            line: 1,
528            message: "unclosed YAML front matter (missing closing `---`)".into(),
529        });
530        return (None, 1, String::new());
531    };
532
533    let close_line = close_offset + 1; // index in `lines`
534    let yaml_str: String = lines[1..close_line].join("\n");
535    let body_start_line = close_line + 2; // 1-indexed line number after closing `---`
536    let body = lines[close_line + 1..].join("\n");
537
538    match serde_yaml::from_str::<FrontMatter>(&yaml_str) {
539        Ok(fm) => (Some(fm), body_start_line, body),
540        Err(e) => {
541            errors.push(ParseError {
542                line: 2,
543                message: format!("invalid YAML front matter: {e}"),
544            });
545            (None, body_start_line, body)
546        }
547    }
548}
549
550fn validate_front_matter(fm: &FrontMatter, errors: &mut Vec<ParseError>) {
551    // Validate case ID (NULID) if present
552    if let Some(id) = &fm.id {
553        if id.len() != MAX_CASE_ID_LEN {
554            errors.push(ParseError {
555                line: 2,
556                message: format!(
557                    "front matter `id` must be a {MAX_CASE_ID_LEN}-char NULID, got {} chars",
558                    id.len()
559                ),
560            });
561        }
562    }
563
564    // Validate sources count
565    if fm.sources.len() > MAX_SOURCES {
566        errors.push(ParseError {
567            line: 2,
568            message: format!(
569                "front matter `sources` exceeds {MAX_SOURCES} entries (got {})",
570                fm.sources.len()
571            ),
572        });
573    }
574
575    // Validate each source URL is HTTPS
576    for (i, source) in fm.sources.iter().enumerate() {
577        if !source.url().starts_with("https://") {
578            errors.push(ParseError {
579                line: 2,
580                message: format!("source[{i}] must be HTTPS, got {:?}", source.url()),
581            });
582        }
583    }
584
585    // Validate case_type
586    if let Some(ct) = &fm.case_type {
587        use crate::domain::CaseType;
588        let normalized = ct.to_lowercase().replace(' ', "_");
589        if !CaseType::KNOWN.contains(&normalized.as_str())
590            && crate::domain::parse_custom(ct).is_none()
591        {
592            errors.push(ParseError {
593                line: 2,
594                message: format!(
595                    "invalid case_type {:?} (known: {}; use \"custom:Value\" for custom)",
596                    ct,
597                    CaseType::KNOWN.join(", ")
598                ),
599            });
600        }
601    }
602
603    // Validate status
604    if let Some(st) = &fm.status {
605        use crate::domain::CaseStatus;
606        let normalized = st.to_lowercase().replace(' ', "_");
607        if !CaseStatus::KNOWN.contains(&normalized.as_str()) {
608            errors.push(ParseError {
609                line: 2,
610                message: format!(
611                    "invalid status {:?} (known: {})",
612                    st,
613                    CaseStatus::KNOWN.join(", ")
614                ),
615            });
616        }
617    }
618
619    // Validate tags
620    if fm.tags.len() > MAX_CASE_TAGS {
621        errors.push(ParseError {
622            line: 2,
623            message: format!(
624                "front matter `tags` exceeds {MAX_CASE_TAGS} entries (got {})",
625                fm.tags.len()
626            ),
627        });
628    }
629    for (i, tag) in fm.tags.iter().enumerate() {
630        if tag.len() > MAX_TAG_LEN {
631            errors.push(ParseError {
632                line: 2,
633                message: format!("tag[{i}] exceeds {MAX_TAG_LEN} chars (got {})", tag.len()),
634            });
635        }
636        if tag.is_empty() {
637            errors.push(ParseError {
638                line: 2,
639                message: format!("tag[{i}] must not be empty"),
640            });
641        }
642    }
643}
644
645/// Extract the H1 title, summary text, and H2 sections from the body.
646#[allow(clippy::too_many_lines)]
647fn extract_body(
648    body: &str,
649    body_start_line: usize,
650    errors: &mut Vec<ParseError>,
651) -> (String, String, Vec<Section>) {
652    let lines: Vec<&str> = body.lines().collect();
653    let mut title = String::new();
654    let mut title_found = false;
655    let mut summary_lines: Vec<&str> = Vec::new();
656    let mut sections: Vec<Section> = Vec::new();
657
658    // Track current H2 section being built
659    let mut current_section_kind: Option<SectionKind> = None;
660    let mut current_section_line: usize = 0;
661    let mut current_section_body: Vec<&str> = Vec::new();
662
663    // State: before H1, after H1 (summary), in sections
664    let mut state = State::BeforeTitle;
665
666    for (i, line) in lines.iter().enumerate() {
667        let file_line = body_start_line + i; // 1-indexed line in original file
668
669        if let Some(heading) = strip_heading(line, 1) {
670            if title_found {
671                errors.push(ParseError {
672                    line: file_line,
673                    message: "multiple H1 headings found (expected exactly one)".into(),
674                });
675                continue;
676            }
677            title = heading.to_string();
678            title_found = true;
679            state = State::Summary;
680            continue;
681        }
682
683        if let Some(heading) = strip_heading(line, 2) {
684            // Flush previous section
685            if let Some(kind) = current_section_kind.take() {
686                sections.push(Section {
687                    kind,
688                    body: current_section_body.join("\n"),
689                    line: current_section_line,
690                });
691                current_section_body.clear();
692            }
693
694            match SectionKind::from_heading(heading) {
695                Some(kind) if kind.is_case_section() => {
696                    // Check for duplicate sections
697                    if sections.iter().any(|s| s.kind == kind) {
698                        errors.push(ParseError {
699                            line: file_line,
700                            message: format!("duplicate section: ## {heading}"),
701                        });
702                    }
703                    current_section_kind = Some(kind);
704                    current_section_line = file_line;
705                    state = State::InSection;
706                }
707                Some(_) => {
708                    // Legacy section (People/Organizations) -- not allowed in case files
709                    errors.push(ParseError {
710                        line: file_line,
711                        message: format!(
712                            "## {heading} is not allowed in case files (use standalone entity files in people/ or organizations/ instead)"
713                        ),
714                    });
715                }
716                None => {
717                    errors.push(ParseError {
718                        line: file_line,
719                        message: format!(
720                            "unknown section: ## {heading} (expected one of: {})",
721                            KNOWN_CASE_SECTIONS.join(", ")
722                        ),
723                    });
724                }
725            }
726            continue;
727        }
728
729        match state {
730            State::BeforeTitle => {
731                // Skip blank lines before title
732                if !line.trim().is_empty() {
733                    errors.push(ParseError {
734                        line: file_line,
735                        message: "expected H1 title (# Title)".into(),
736                    });
737                }
738            }
739            State::Summary => {
740                summary_lines.push(line);
741            }
742            State::InSection => {
743                current_section_body.push(line);
744            }
745        }
746    }
747
748    // Flush last section
749    if let Some(kind) = current_section_kind.take() {
750        sections.push(Section {
751            kind,
752            body: current_section_body.join("\n"),
753            line: current_section_line,
754        });
755    }
756
757    // Validate title
758    if !title_found {
759        errors.push(ParseError {
760            line: body_start_line,
761            message: "missing H1 title".into(),
762        });
763    } else if title.len() > MAX_TITLE_LEN {
764        errors.push(ParseError {
765            line: body_start_line,
766            message: format!(
767                "H1 title exceeds {MAX_TITLE_LEN} chars (got {})",
768                title.len()
769            ),
770        });
771    }
772
773    // Build summary (trim leading/trailing blank lines)
774    let summary = summary_lines.clone().join("\n").trim().to_string();
775
776    if summary.len() > MAX_SUMMARY_LEN {
777        errors.push(ParseError {
778            line: body_start_line,
779            message: format!(
780                "summary exceeds {MAX_SUMMARY_LEN} chars (got {})",
781                summary.len()
782            ),
783        });
784    }
785
786    (title, summary, sections)
787}
788
/// Position of the `extract_body` line scanner within the document.
#[derive(Clone, Copy)]
enum State {
    /// No H1 seen yet; non-blank, non-heading lines are reported as errors.
    BeforeTitle,
    /// After the H1; plain lines are collected into the summary.
    Summary,
    /// After an accepted H2; plain lines are collected into that section's body.
    InSection,
}
795
/// Strip an ATX heading prefix of exactly the given level and return the
/// trimmed heading text.
///
/// E.g., `strip_heading("## Foo", 2)` returns `Some("Foo")`, while
/// `strip_heading("### Foo", 2)` returns `None`. Leading whitespace is
/// ignored. A line consisting only of the `#` run yields `Some("")`.
/// Otherwise the `#` run must be followed by a single space, and the
/// character after that space must not be another `#`.
fn strip_heading(line: &str, level: usize) -> Option<&str> {
    let candidate = line.trim_start();

    // The run of leading `#` must be exactly `level` long: fewer is not
    // this heading at all, more is a deeper heading level.
    let hashes = candidate.bytes().take_while(|&b| b == b'#').count();
    if hashes != level {
        return None;
    }

    // `#` is ASCII, so `level` is a valid byte offset into `candidate`.
    let rest = &candidate[level..];
    if rest.is_empty() {
        return Some("");
    }

    match rest.strip_prefix(' ') {
        Some(text) if !text.starts_with('#') => Some(text.trim()),
        _ => None,
    }
}
818
819#[cfg(test)]
820mod tests {
821    use super::*;
822
823    fn minimal_case() -> String {
824        [
825            "---",
826            "id: 01H9XT7H1J3929RK32FWSRKV88",
827            "sources:",
828            "  - https://example.com/source",
829            "---",
830            "",
831            "# Test Case Title",
832            "",
833            "This is the summary.",
834            "",
835            "## Events",
836            "",
837            "### Something happened",
838            "- occurred_at: 2025-01-01",
839            "",
840            "## Relationships",
841            "",
842            "- Something happened -> Something happened: associate_of",
843        ]
844        .join("\n")
845    }
846
847    #[test]
848    fn parse_minimal_case() {
849        let result = parse(&minimal_case());
850        let case = result.unwrap_or_else(|errs| {
851            panic!(
852                "parse failed: {}",
853                errs.iter()
854                    .map(ToString::to_string)
855                    .collect::<Vec<_>>()
856                    .join("; ")
857            );
858        });
859
860        assert_eq!(case.id.as_deref(), Some("01H9XT7H1J3929RK32FWSRKV88"));
861        assert_eq!(case.sources.len(), 1);
862        assert_eq!(case.sources[0].url(), "https://example.com/source");
863        assert_eq!(case.title, "Test Case Title");
864        assert_eq!(case.summary, "This is the summary.");
865        assert_eq!(case.sections.len(), 2);
866        assert_eq!(case.sections[0].kind, SectionKind::Events);
867        assert_eq!(case.sections[1].kind, SectionKind::Relationships);
868    }
869
870    #[test]
871    fn parse_missing_front_matter() {
872        let input = "# Title\n\nSummary.\n";
873        let errs = parse(input).unwrap_err();
874        assert!(errs.iter().any(|e| e.message.contains("front matter")));
875    }
876
877    #[test]
878    fn parse_unclosed_front_matter() {
879        let input = "---\nsources: []\n# Title\n";
880        let errs = parse(input).unwrap_err();
881        assert!(errs.iter().any(|e| e.message.contains("unclosed")));
882    }
883
884    #[test]
885    fn parse_invalid_case_id_wrong_length() {
886        let input = "---\nid: short\nsources: []\n---\n\n# Title\n";
887        let errs = parse(input).unwrap_err();
888        assert!(errs.iter().any(|e| e.message.contains("NULID")));
889    }
890
891    #[test]
892    fn parse_case_id_absent_is_ok() {
893        let input = "---\nsources:\n  - https://example.com\n---\n\n# Title\n\nSummary.\n";
894        let case = parse(input).unwrap();
895        assert!(case.id.is_none());
896    }
897
898    #[test]
899    fn parse_non_https_source() {
900        let input = "---\nsources:\n  - http://example.com\n---\n\n# Title\n";
901        let errs = parse(input).unwrap_err();
902        assert!(errs.iter().any(|e| e.message.contains("HTTPS")));
903    }
904
905    #[test]
906    fn parse_too_many_sources() {
907        let sources: Vec<String> = (0..21)
908            .map(|i| format!("  - https://example.com/{i}"))
909            .collect();
910        let input = format!("---\nsources:\n{}\n---\n\n# Title\n", sources.join("\n"));
911        let errs = parse(&input).unwrap_err();
912        assert!(errs.iter().any(|e| e.message.contains("exceeds 20")));
913    }
914
915    #[test]
916    fn parse_unknown_section() {
917        let input = [
918            "---",
919            "sources: []",
920            "---",
921            "",
922            "# Title",
923            "",
924            "## Unknown Section",
925            "",
926        ]
927        .join("\n");
928        let errs = parse(&input).unwrap_err();
929        assert!(errs.iter().any(|e| e.message.contains("unknown section")));
930    }
931
932    #[test]
933    fn parse_duplicate_section() {
934        let input = [
935            "---",
936            "sources: []",
937            "---",
938            "",
939            "# Title",
940            "",
941            "## Events",
942            "",
943            "## Events",
944            "",
945        ]
946        .join("\n");
947        let errs = parse(&input).unwrap_err();
948        assert!(errs.iter().any(|e| e.message.contains("duplicate")));
949    }
950
951    #[test]
952    fn parse_multiple_h1() {
953        let input = [
954            "---",
955            "sources: []",
956            "---",
957            "",
958            "# First Title",
959            "",
960            "# Second Title",
961            "",
962        ]
963        .join("\n");
964        let errs = parse(&input).unwrap_err();
965        assert!(errs.iter().any(|e| e.message.contains("multiple H1")));
966    }
967
968    #[test]
969    fn parse_all_sections() {
970        let input = [
971            "---",
972            "id: 01H9XT7H1KRQ9SJ7SD9ETB5CVQ",
973            "sources:",
974            "  - https://example.com/a",
975            "---",
976            "",
977            "# Full Case",
978            "",
979            "Summary text here.",
980            "",
981            "## Events",
982            "",
983            "### Something happened",
984            "- occurred_at: 2025-01-01",
985            "",
986            "## Relationships",
987            "",
988            "- Alice -> Corp Inc: employed_by",
989            "",
990            "## Timeline",
991            "",
992            "Something happened",
993        ]
994        .join("\n");
995
996        let case = parse(&input).unwrap_or_else(|errs| {
997            panic!(
998                "parse failed: {}",
999                errs.iter()
1000                    .map(ToString::to_string)
1001                    .collect::<Vec<_>>()
1002                    .join("; ")
1003            );
1004        });
1005
1006        assert_eq!(case.id.as_deref(), Some("01H9XT7H1KRQ9SJ7SD9ETB5CVQ"));
1007        assert_eq!(case.title, "Full Case");
1008        assert_eq!(case.summary, "Summary text here.");
1009        assert_eq!(case.sections.len(), 3);
1010        assert_eq!(case.sections[0].kind, SectionKind::Events);
1011        assert_eq!(case.sections[1].kind, SectionKind::Relationships);
1012        assert_eq!(case.sections[2].kind, SectionKind::Timeline);
1013    }
1014
1015    #[test]
1016    fn parse_empty_summary() {
1017        let input = [
1018            "---",
1019            "sources: []",
1020            "---",
1021            "",
1022            "# Title",
1023            "",
1024            "## Events",
1025            "",
1026        ]
1027        .join("\n");
1028
1029        let case = parse(&input).unwrap_or_else(|errs| {
1030            panic!(
1031                "parse failed: {}",
1032                errs.iter()
1033                    .map(ToString::to_string)
1034                    .collect::<Vec<_>>()
1035                    .join("; ")
1036            );
1037        });
1038        assert_eq!(case.summary, "");
1039    }
1040
1041    #[test]
1042    fn parse_multiline_summary() {
1043        let input = [
1044            "---",
1045            "sources: []",
1046            "---",
1047            "",
1048            "# Title",
1049            "",
1050            "First line of summary.",
1051            "Second line of summary.",
1052            "",
1053            "## Events",
1054            "",
1055        ]
1056        .join("\n");
1057
1058        let case = parse(&input).unwrap_or_else(|errs| {
1059            panic!(
1060                "parse failed: {}",
1061                errs.iter()
1062                    .map(ToString::to_string)
1063                    .collect::<Vec<_>>()
1064                    .join("; ")
1065            );
1066        });
1067        assert_eq!(
1068            case.summary,
1069            "First line of summary.\nSecond line of summary."
1070        );
1071    }
1072
1073    #[test]
1074    fn strip_heading_levels() {
1075        assert_eq!(strip_heading("# Title", 1), Some("Title"));
1076        assert_eq!(strip_heading("## Section", 2), Some("Section"));
1077        assert_eq!(strip_heading("### Entity", 3), Some("Entity"));
1078        // H3 should not match H2
1079        assert_eq!(strip_heading("### Entity", 2), None);
1080        // H2 should not match H1
1081        assert_eq!(strip_heading("## Section", 1), None);
1082        // Not a heading
1083        assert_eq!(strip_heading("Normal text", 1), None);
1084    }
1085
1086    #[test]
1087    fn section_body_content() {
1088        let input = [
1089            "---",
1090            "sources: []",
1091            "---",
1092            "",
1093            "# Title",
1094            "",
1095            "## Events",
1096            "",
1097            "### Bonnick dismissal",
1098            "- occurred_at: 2024-12-24",
1099            "- type: termination",
1100            "",
1101        ]
1102        .join("\n");
1103
1104        let case = parse(&input).unwrap_or_else(|errs| {
1105            panic!(
1106                "parse failed: {}",
1107                errs.iter()
1108                    .map(ToString::to_string)
1109                    .collect::<Vec<_>>()
1110                    .join("; ")
1111            );
1112        });
1113
1114        assert_eq!(case.sections.len(), 1);
1115        let body = &case.sections[0].body;
1116        assert!(body.contains("### Bonnick dismissal"));
1117        assert!(body.contains("- occurred_at: 2024-12-24"));
1118    }
1119
1120    #[test]
1121    fn parse_rejects_people_section_in_case_file() {
1122        let input = [
1123            "---",
1124            "sources: []",
1125            "---",
1126            "",
1127            "# Title",
1128            "",
1129            "## People",
1130            "",
1131        ]
1132        .join("\n");
1133        let errs = parse(&input).unwrap_err();
1134        assert!(
1135            errs.iter()
1136                .any(|e| e.message.contains("not allowed in case files"))
1137        );
1138    }
1139
1140    #[test]
1141    fn parse_rejects_organizations_section_in_case_file() {
1142        let input = [
1143            "---",
1144            "sources: []",
1145            "---",
1146            "",
1147            "# Title",
1148            "",
1149            "## Organizations",
1150            "",
1151        ]
1152        .join("\n");
1153        let errs = parse(&input).unwrap_err();
1154        assert!(
1155            errs.iter()
1156                .any(|e| e.message.contains("not allowed in case files"))
1157        );
1158    }
1159
1160    #[test]
1161    fn parse_entity_file_with_id() {
1162        let input = [
1163            "---",
1164            "id: 01JXYZ123456789ABCDEFGHIJK",
1165            "---",
1166            "",
1167            "# Mark Bonnick",
1168            "",
1169            "- qualifier: Arsenal Kit Manager",
1170            "- nationality: British",
1171            "",
1172        ]
1173        .join("\n");
1174
1175        let result = parse_entity_file(&input).unwrap();
1176        assert_eq!(result.id.as_deref(), Some("01JXYZ123456789ABCDEFGHIJK"));
1177        assert_eq!(result.name, "Mark Bonnick");
1178        assert!(result.body.contains("- qualifier: Arsenal Kit Manager"));
1179        assert!(result.body.contains("- nationality: British"));
1180    }
1181
1182    #[test]
1183    fn parse_entity_file_without_id() {
1184        let input = [
1185            "---",
1186            "---",
1187            "",
1188            "# Arsenal FC",
1189            "",
1190            "- qualifier: English Football Club",
1191            "- org_type: sports_club",
1192            "",
1193        ]
1194        .join("\n");
1195
1196        let result = parse_entity_file(&input).unwrap();
1197        assert!(result.id.is_none());
1198        assert_eq!(result.name, "Arsenal FC");
1199    }
1200
1201    #[test]
1202    fn parse_entity_file_no_front_matter() {
1203        let input = ["# Bob Smith", "", "- nationality: Dutch", ""].join("\n");
1204
1205        let result = parse_entity_file(&input).unwrap();
1206        assert!(result.id.is_none());
1207        assert_eq!(result.name, "Bob Smith");
1208        assert!(result.body.contains("- nationality: Dutch"));
1209    }
1210
1211    #[test]
1212    fn parse_entity_file_rejects_h2_sections() {
1213        let input = [
1214            "---",
1215            "---",
1216            "",
1217            "# Test Entity",
1218            "",
1219            "## Relationships",
1220            "",
1221        ]
1222        .join("\n");
1223
1224        let errs = parse_entity_file(&input).unwrap_err();
1225        assert!(errs.iter().any(|e| e.message.contains("H2 sections")));
1226    }
1227
1228    #[test]
1229    fn parse_entity_file_missing_h1() {
1230        let input = ["---", "---", "", "- nationality: Dutch", ""].join("\n");
1231
1232        let errs = parse_entity_file(&input).unwrap_err();
1233        assert!(errs.iter().any(|e| e.message.contains("missing H1")));
1234    }
1235
1236    #[test]
1237    fn parse_related_cases_section() {
1238        let input = [
1239            "---",
1240            "tags: [bribery]",
1241            "sources:",
1242            "  - https://example.com",
1243            "---",
1244            "",
1245            "# Test Case",
1246            "",
1247            "Summary text.",
1248            "",
1249            "## Related Cases",
1250            "",
1251            "- id/corruption/2002/blbi-liquidity-aid-scandal",
1252            "  description: Artalyta bribed Urip to influence the BLBI investigation",
1253            "- id/corruption/2008/another-case",
1254            "  description: A second related case",
1255        ]
1256        .join("\n");
1257
1258        let case = parse(&input).unwrap_or_else(|errs| {
1259            panic!(
1260                "parse failed: {}",
1261                errs.iter()
1262                    .map(ToString::to_string)
1263                    .collect::<Vec<_>>()
1264                    .join("; ")
1265            );
1266        });
1267
1268        assert_eq!(case.related_cases.len(), 2);
1269        assert_eq!(
1270            case.related_cases[0].case_path,
1271            "id/corruption/2002/blbi-liquidity-aid-scandal"
1272        );
1273        assert_eq!(
1274            case.related_cases[0].description,
1275            "Artalyta bribed Urip to influence the BLBI investigation"
1276        );
1277        assert_eq!(
1278            case.related_cases[1].case_path,
1279            "id/corruption/2008/another-case"
1280        );
1281        assert_eq!(case.related_cases[1].description, "A second related case");
1282        // RelatedCases should be consumed and NOT appear in sections
1283        assert!(
1284            !case
1285                .sections
1286                .iter()
1287                .any(|s| s.kind == SectionKind::RelatedCases)
1288        );
1289    }
1290
1291    #[test]
1292    fn parse_related_cases_empty_path() {
1293        let input = [
1294            "---",
1295            "sources: []",
1296            "---",
1297            "",
1298            "# Title",
1299            "",
1300            "## Related Cases",
1301            "",
1302            "- ",
1303            "  description: Some description",
1304        ]
1305        .join("\n");
1306
1307        let errs = parse(&input).unwrap_err();
1308        assert!(
1309            errs.iter()
1310                .any(|e| e.message.contains("case path must not be empty"))
1311        );
1312    }
1313
1314    #[test]
1315    fn parse_related_cases_missing_description() {
1316        let input = [
1317            "---",
1318            "sources: []",
1319            "---",
1320            "",
1321            "# Title",
1322            "",
1323            "## Related Cases",
1324            "",
1325            "- id/corruption/2002/some-case",
1326        ]
1327        .join("\n");
1328
1329        let errs = parse(&input).unwrap_err();
1330        assert!(errs.iter().any(|e| e.message.contains("description")));
1331    }
1332
1333    #[test]
1334    fn parse_related_cases_description_too_long() {
1335        let long_desc = "x".repeat(501);
1336        let input = [
1337            "---",
1338            "sources: []",
1339            "---",
1340            "",
1341            "# Title",
1342            "",
1343            "## Related Cases",
1344            "",
1345            "- id/corruption/2002/some-case",
1346            &format!("  description: {long_desc}"),
1347        ]
1348        .join("\n");
1349
1350        let errs = parse(&input).unwrap_err();
1351        assert!(errs.iter().any(|e| e.message.contains("exceeds 500")));
1352    }
1353
1354    #[test]
1355    fn parse_related_cases_too_many() {
1356        let mut lines = vec![
1357            "---".to_string(),
1358            "sources: []".to_string(),
1359            "---".to_string(),
1360            String::new(),
1361            "# Title".to_string(),
1362            String::new(),
1363            "## Related Cases".to_string(),
1364            String::new(),
1365        ];
1366        for i in 0..11 {
1367            lines.push(format!("- id/corruption/2002/case-{i}"));
1368            lines.push(format!("  description: Description {i}"));
1369        }
1370        let input = lines.join("\n");
1371
1372        let errs = parse(&input).unwrap_err();
1373        assert!(errs.iter().any(|e| e.message.contains("exceeds 10")));
1374    }
1375}
weave_content/parser.rs

weave_content/
parser.rs