weave_content/
parser.rs

1#![allow(clippy::module_name_repetitions)]
2
3use std::fmt;
4
5use serde::{Deserialize, Serialize};
6
/// Length of a case NULID: 26 Crockford Base32 characters.
/// Front matter validation requires the `id` to be exactly this long.
const MAX_CASE_ID_LEN: usize = 26;

/// Upper bound on the number of `sources` entries in case front matter.
const MAX_SOURCES: usize = 20;

/// Upper bound on the length of an H1 heading (case title or entity name).
const MAX_TITLE_LEN: usize = 200;

/// Upper bound on the length of the case summary.
const MAX_SUMMARY_LEN: usize = 2000;
18
/// H2 section names accepted in case files (matched case-insensitively).
/// The list is shown to authors in the "unknown section" error, so it must
/// name every heading the parser accepts -- including `Involved`.
/// People and Organizations are no longer allowed in case files -- they
/// live in standalone entity files under `people/` and `organizations/`.
const KNOWN_CASE_SECTIONS: &[&str] = &[
    "Events",
    "Documents",
    "Assets",
    "Relationships",
    "Timeline",
    "Related Cases",
    "Involved",
];
30
31/// A parsed case file with front matter, title, summary, and raw sections.
32#[derive(Debug)]
33pub struct ParsedCase {
34    /// NULID for the case node (None if not yet generated).
35    pub id: Option<String>,
36    pub sources: Vec<SourceEntry>,
37    pub title: String,
38    pub summary: String,
39    pub sections: Vec<Section>,
40    /// Case type from front matter (e.g. `corruption`, `fraud`).
41    pub case_type: Option<String>,
42    /// Case status from front matter (e.g. `open`, `trial`).
43    pub status: Option<String>,
44    /// Structured amounts DSL string (e.g. `660000 USD bribe | 250000000 IDR fine`).
45    pub amounts: Option<String>,
46    /// Tags from front matter for categorization.
47    pub tags: Vec<String>,
48    /// Related case entries from `## Related Cases` section.
49    pub related_cases: Vec<RelatedCase>,
50    /// Involved entity entries from `## Involved` section.
51    pub involved: Vec<InvolvedEntry>,
52}
53
54/// A related case entry from `## Related Cases` section.
55#[derive(Debug, Clone, PartialEq, Eq, Serialize)]
56pub struct RelatedCase {
57    /// Case path relative to content root (e.g. `id/corruption/2002/blbi-liquidity-aid-scandal`).
58    pub case_path: String,
59    /// Description of the relationship between the cases.
60    pub description: String,
61    /// NULID for the `related_to` relationship (auto-generated on first build).
62    #[serde(skip_serializing_if = "Option::is_none")]
63    pub id: Option<String>,
64    /// Line number (1-indexed) where this entry appears in the original file.
65    #[serde(skip)]
66    pub line: usize,
67}
68
/// One bullet of the `## Involved` section, naming an entity tied to the case.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct InvolvedEntry {
    /// Name of the entity (must match a registry entity referenced in the case).
    pub entity_name: String,
    /// NULID of the `involved_in` relationship (auto-generated on first build).
    pub id: Option<String>,
    /// 1-indexed line of the bullet in the original file.
    pub line: usize,
}
79
80/// A raw H2 section with its heading text and body content.
81#[derive(Debug)]
82pub struct Section {
83    pub kind: SectionKind,
84    pub body: String,
85    /// Line number (1-indexed) where the H2 heading appears in the original file.
86    pub line: usize,
87}
88
/// The kind of an H2 section, derived from its heading text.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum SectionKind {
    People,
    Organizations,
    Events,
    Documents,
    Assets,
    Relationships,
    Timeline,
    RelatedCases,
    Involved,
}

impl SectionKind {
    /// Map heading text to a section kind.
    ///
    /// Surrounding whitespace is ignored and the comparison is ASCII
    /// case-insensitive. Returns `None` for headings that are not recognized.
    fn from_heading(heading: &str) -> Option<Self> {
        // Heading label paired with the kind it selects.
        const HEADINGS: [(&str, SectionKind); 9] = [
            ("People", SectionKind::People),
            ("Organizations", SectionKind::Organizations),
            ("Events", SectionKind::Events),
            ("Documents", SectionKind::Documents),
            ("Assets", SectionKind::Assets),
            ("Relationships", SectionKind::Relationships),
            ("Timeline", SectionKind::Timeline),
            ("Related Cases", SectionKind::RelatedCases),
            ("Involved", SectionKind::Involved),
        ];

        let wanted = heading.trim();
        HEADINGS
            .iter()
            .find(|(label, _)| wanted.eq_ignore_ascii_case(label))
            .map(|&(_, kind)| kind)
    }

    /// Whether this section kind may appear in a case file.
    ///
    /// `People` and `Organizations` are recognized headings but are no longer
    /// allowed in case files.
    pub fn is_case_section(self) -> bool {
        // Exhaustive on purpose: a new variant must be classified explicitly.
        match self {
            Self::People | Self::Organizations => false,
            Self::Events
            | Self::Documents
            | Self::Assets
            | Self::Relationships
            | Self::Timeline
            | Self::RelatedCases
            | Self::Involved => true,
        }
    }
}
134
/// A parse problem together with the file line it was detected on.
#[derive(Debug)]
pub struct ParseError {
    /// 1-indexed line number in the original file.
    pub line: usize,
    /// Human-readable description of the problem.
    pub message: String,
}

impl fmt::Display for ParseError {
    /// Renders as `line <n>: <message>`.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let Self { line, message } = self;
        write!(f, "line {line}: {message}")
    }
}
147
/// Upper bound on the number of front matter tags in a case file.
const MAX_CASE_TAGS: usize = 10;

/// Upper bound on the number of front matter tags in an entity file.
const MAX_ENTITY_TAGS: usize = 5;

/// Upper bound on the length of any single tag.
const MAX_TAG_LEN: usize = 50;

/// Upper bound on the number of `## Related Cases` entries per case file.
const MAX_RELATED_CASES: usize = 10;

/// Upper bound on the length of a related case description.
const MAX_RELATED_DESCRIPTION_LEN: usize = 500;
162
163/// Parse the body of a `## Related Cases` section into `RelatedCase` entries.
164///
165/// Each entry is a bullet `- <case_path>` followed by indented fields:
166/// `description: <text>` (required) and `id: <NULID>` (optional, written back).
167pub fn parse_related_cases(
168    body: &str,
169    section_start_line: usize,
170    errors: &mut Vec<ParseError>,
171) -> Vec<RelatedCase> {
172    let mut entries: Vec<(String, String, Option<String>, usize)> = Vec::new(); // (path, desc, id, line)
173
174    for (offset, line) in body.lines().enumerate() {
175        let file_line = section_start_line + offset + 1;
176
177        if let Some(rest) = line.strip_prefix("- ") {
178            let case_path = rest.trim().to_string();
179            entries.push((case_path, String::new(), None, file_line));
180        } else if let Some(rest) = line.strip_prefix("  description: ") {
181            if let Some(entry) = entries.last_mut() {
182                entry.1 = rest.trim().to_string();
183            } else {
184                errors.push(ParseError {
185                    line: file_line,
186                    message: "description without a preceding case path".into(),
187                });
188            }
189        } else if let Some(rest) = line.strip_prefix("  id: ") {
190            if let Some(entry) = entries.last_mut() {
191                entry.2 = Some(rest.trim().to_string());
192            } else {
193                errors.push(ParseError {
194                    line: file_line,
195                    message: "id without a preceding case path".into(),
196                });
197            }
198        } else if !line.trim().is_empty() {
199            errors.push(ParseError {
200                line: file_line,
201                message: format!("unexpected line in Related Cases: {line}"),
202            });
203        }
204    }
205
206    if entries.len() > MAX_RELATED_CASES {
207        errors.push(ParseError {
208            line: section_start_line,
209            message: format!(
210                "Related Cases exceeds {MAX_RELATED_CASES} entries (got {})",
211                entries.len()
212            ),
213        });
214    }
215
216    let mut result = Vec::new();
217    for (case_path, description, id, line) in entries {
218        if case_path.is_empty() {
219            errors.push(ParseError {
220                line,
221                message: "related case path must not be empty".into(),
222            });
223            continue;
224        }
225        if description.is_empty() {
226            errors.push(ParseError {
227                line,
228                message: format!("related case {case_path:?} missing description"),
229            });
230            continue;
231        }
232        if description.len() > MAX_RELATED_DESCRIPTION_LEN {
233            errors.push(ParseError {
234                line,
235                message: format!(
236                    "related case description exceeds {MAX_RELATED_DESCRIPTION_LEN} chars (got {})",
237                    description.len()
238                ),
239            });
240            continue;
241        }
242        result.push(RelatedCase {
243            case_path,
244            description,
245            id,
246            line,
247        });
248    }
249
250    result
251}
252
/// Upper bound on the number of `## Involved` entries per case file.
const MAX_INVOLVED: usize = 50;
255
256/// Parse the body of a `## Involved` section into `InvolvedEntry` items.
257///
258/// Format:
259/// ```text
260/// - Entity Name
261///   id: 01ABC...
262/// ```
263pub fn parse_involved(
264    body: &str,
265    section_start_line: usize,
266    errors: &mut Vec<ParseError>,
267) -> Vec<InvolvedEntry> {
268    let mut entries = Vec::new();
269    let lines: Vec<&str> = body.lines().collect();
270
271    let mut i = 0;
272    while i < lines.len() {
273        let file_line = section_start_line + 1 + i;
274        let trimmed = lines[i].trim();
275
276        if trimmed.is_empty() {
277            i += 1;
278            continue;
279        }
280
281        let Some(name) = trimmed.strip_prefix("- ") else {
282            errors.push(ParseError {
283                line: file_line,
284                message: format!("expected involved entry `- Entity Name`, got {trimmed:?}"),
285            });
286            i += 1;
287            continue;
288        };
289
290        let entity_name = name.trim().to_string();
291        if entity_name.is_empty() {
292            errors.push(ParseError {
293                line: file_line,
294                message: "involved entity name must not be empty".into(),
295            });
296            i += 1;
297            continue;
298        }
299
300        // Look ahead for `id:` on the next line
301        let mut id: Option<String> = None;
302        if i + 1 < lines.len() {
303            let next = lines[i + 1].trim();
304            if let Some(id_val) = next.strip_prefix("id: ") {
305                id = Some(id_val.trim().to_string());
306                i += 1;
307            }
308        }
309
310        entries.push(InvolvedEntry {
311            entity_name,
312            id,
313            line: file_line,
314        });
315
316        i += 1;
317    }
318
319    if entries.len() > MAX_INVOLVED {
320        errors.push(ParseError {
321            line: section_start_line,
322            message: format!(
323                "Involved exceeds {MAX_INVOLVED} entries (got {})",
324                entries.len()
325            ),
326        });
327    }
328
329    entries
330}
331
332/// YAML front matter schema.
333#[derive(Deserialize)]
334struct FrontMatter {
335    /// NULID for the case node (auto-generated on first build).
336    #[serde(default)]
337    id: Option<String>,
338    #[serde(default)]
339    sources: Vec<SourceEntry>,
340    #[serde(default)]
341    case_type: Option<String>,
342    #[serde(default)]
343    status: Option<String>,
344    #[serde(default)]
345    amounts: Option<String>,
346    #[serde(default)]
347    tags: Vec<String>,
348}
349
350/// A source entry in front matter. Supports both bare URL strings and
351/// structured objects with metadata.
352#[derive(Debug, Clone, PartialEq, Eq, Deserialize, Serialize)]
353#[serde(untagged)]
354pub enum SourceEntry {
355    /// Plain URL string (backward-compatible).
356    Url(String),
357    /// Structured source with metadata.
358    Structured {
359        url: String,
360        #[serde(default)]
361        title: Option<String>,
362        #[serde(default)]
363        published_at: Option<String>,
364        #[serde(default)]
365        language: Option<String>,
366    },
367}
368
369impl SourceEntry {
370    /// Get the URL from any source entry variant.
371    pub fn url(&self) -> &str {
372        match self {
373            Self::Url(u) => u,
374            Self::Structured { url, .. } => url,
375        }
376    }
377}
378
379/// YAML front matter schema for standalone entity files.
380/// Only contains an optional `id` field (NULID, generated on first build).
381#[derive(Deserialize)]
382struct EntityFrontMatter {
383    #[serde(default)]
384    id: Option<String>,
385    #[serde(default)]
386    tags: Vec<String>,
387}
388
/// The result of parsing a standalone entity file (actor or institution).
#[derive(Debug)]
pub struct ParsedEntityFile {
    /// NULID stored in front matter; `None` until one has been generated.
    pub id: Option<String>,
    /// Entity name, taken from the H1 heading.
    pub name: String,
    /// Raw lines following the H1 (bullet fields; entity files have no sections).
    pub body: String,
    /// Line of the H1 heading in the original file.
    pub title_line: usize,
    /// Tags from front matter.
    pub tags: Vec<String>,
}
403
404/// Parse a Markdown case file into a `ParsedCase`.
405///
406/// Extracts YAML front matter, H1 title, summary, and H2 sections.
407/// Returns errors for malformed structure or boundary violations.
408pub fn parse(input: &str) -> Result<ParsedCase, Vec<ParseError>> {
409    let mut errors = Vec::new();
410
411    // Extract front matter
412    let (front_matter, body_start_line, body) = extract_front_matter(input, &mut errors);
413
414    let Some(front_matter) = front_matter else {
415        if errors.is_empty() {
416            errors.push(ParseError {
417                line: 1,
418                message: "missing YAML front matter (expected `---` delimiter)".into(),
419            });
420        }
421        return Err(errors);
422    };
423
424    // Validate front matter fields
425    validate_front_matter(&front_matter, &mut errors);
426
427    // Extract title, summary, and sections from body
428    let (title, summary, mut sections) = extract_body(&body, body_start_line, &mut errors);
429
430    // Parse Related Cases sections
431    let mut related_cases = Vec::new();
432    for section in &sections {
433        if section.kind == SectionKind::RelatedCases {
434            let entries = parse_related_cases(&section.body, section.line, &mut errors);
435            related_cases.extend(entries);
436        }
437    }
438    // Remove RelatedCases from sections list (consumed)
439    sections.retain(|s| s.kind != SectionKind::RelatedCases);
440
441    // Parse Involved sections
442    let mut involved = Vec::new();
443    for section in &sections {
444        if section.kind == SectionKind::Involved {
445            let entries = parse_involved(&section.body, section.line, &mut errors);
446            involved.extend(entries);
447        }
448    }
449    // Remove Involved from sections list (consumed)
450    sections.retain(|s| s.kind != SectionKind::Involved);
451
452    if !errors.is_empty() {
453        return Err(errors);
454    }
455
456    Ok(ParsedCase {
457        id: front_matter.id,
458        sources: front_matter.sources,
459        title,
460        summary,
461        sections,
462        case_type: front_matter.case_type,
463        status: front_matter.status,
464        amounts: front_matter.amounts,
465        tags: front_matter.tags,
466        related_cases,
467        involved,
468    })
469}
470
471/// Parse a standalone entity file (actor or institution).
472///
473/// Entity files have YAML front matter with optional `id:`, an H1 name,
474/// and bullet fields directly in the body. No H2 sections are allowed.
475pub fn parse_entity_file(input: &str) -> Result<ParsedEntityFile, Vec<ParseError>> {
476    let mut errors = Vec::new();
477
478    let (front_matter, body_start_line, body) = extract_entity_front_matter(input, &mut errors);
479
480    let id = front_matter.as_ref().and_then(|fm| fm.id.clone());
481    let tags = front_matter.map_or_else(Vec::new, |fm| fm.tags);
482
483    // Validate entity tags
484    if tags.len() > MAX_ENTITY_TAGS {
485        errors.push(ParseError {
486            line: 2,
487            message: format!(
488                "front matter `tags` exceeds {MAX_ENTITY_TAGS} entries (got {})",
489                tags.len()
490            ),
491        });
492    }
493    for (i, tag) in tags.iter().enumerate() {
494        if tag.len() > MAX_TAG_LEN {
495            errors.push(ParseError {
496                line: 2,
497                message: format!("front matter tag #{} exceeds {MAX_TAG_LEN} chars", i + 1),
498            });
499        }
500        if tag.is_empty() {
501            errors.push(ParseError {
502                line: 2,
503                message: format!("front matter tag #{} is empty", i + 1),
504            });
505        }
506    }
507
508    // Extract H1 title and body content (no sections allowed)
509    let (name, title_line, field_body) = extract_entity_body(&body, body_start_line, &mut errors);
510
511    if !errors.is_empty() {
512        return Err(errors);
513    }
514
515    Ok(ParsedEntityFile {
516        id,
517        name,
518        body: field_body,
519        title_line,
520        tags,
521    })
522}
523
524/// Extract YAML front matter for entity files.
525/// Front matter is optional for entity files -- if absent, returns None with no error.
526fn extract_entity_front_matter(
527    input: &str,
528    errors: &mut Vec<ParseError>,
529) -> (Option<EntityFrontMatter>, usize, String) {
530    let lines: Vec<&str> = input.lines().collect();
531
532    let first_delim = lines.iter().position(|l| l.trim() == "---");
533    if first_delim != Some(0) {
534        // No front matter -- entire file is body, starting at line 1
535        return (None, 1, input.to_string());
536    }
537
538    let close_delim = lines[1..].iter().position(|l| l.trim() == "---");
539    let Some(close_offset) = close_delim else {
540        errors.push(ParseError {
541            line: 1,
542            message: "unclosed YAML front matter (missing closing `---`)".into(),
543        });
544        return (None, 1, String::new());
545    };
546
547    let close_line = close_offset + 1;
548    let yaml_str: String = lines[1..close_line].join("\n");
549    let body_start_line = close_line + 2; // 1-indexed line number after closing `---`
550    let body = lines[close_line + 1..].join("\n");
551
552    match serde_yaml::from_str::<EntityFrontMatter>(&yaml_str) {
553        Ok(fm) => (Some(fm), body_start_line, body),
554        Err(e) => {
555            errors.push(ParseError {
556                line: 2,
557                message: format!("invalid YAML front matter: {e}"),
558            });
559            (None, body_start_line, body)
560        }
561    }
562}
563
564/// Extract H1 name and field body from an entity file.
565/// Rejects any H2 sections.
566fn extract_entity_body(
567    body: &str,
568    body_start_line: usize,
569    errors: &mut Vec<ParseError>,
570) -> (String, usize, String) {
571    let lines: Vec<&str> = body.lines().collect();
572    let mut name = String::new();
573    let mut title_found = false;
574    let mut title_line = body_start_line;
575    let mut field_lines: Vec<&str> = Vec::new();
576
577    for (i, line) in lines.iter().enumerate() {
578        let file_line = body_start_line + i;
579
580        if let Some(heading) = strip_heading(line, 1) {
581            if title_found {
582                errors.push(ParseError {
583                    line: file_line,
584                    message: "multiple H1 headings found (expected exactly one)".into(),
585                });
586                continue;
587            }
588            name = heading.to_string();
589            title_found = true;
590            title_line = file_line;
591            continue;
592        }
593
594        // Reject H2 sections in entity files
595        if strip_heading(line, 2).is_some() {
596            errors.push(ParseError {
597                line: file_line,
598                message: "H2 sections are not allowed in entity files".into(),
599            });
600            continue;
601        }
602
603        if title_found {
604            field_lines.push(line);
605        } else if !line.trim().is_empty() {
606            errors.push(ParseError {
607                line: file_line,
608                message: "expected H1 heading (# Name)".into(),
609            });
610        }
611    }
612
613    if !title_found {
614        errors.push(ParseError {
615            line: body_start_line,
616            message: "missing H1 heading".into(),
617        });
618    } else if name.len() > MAX_TITLE_LEN {
619        errors.push(ParseError {
620            line: title_line,
621            message: format!("H1 name exceeds {MAX_TITLE_LEN} chars (got {})", name.len()),
622        });
623    }
624
625    (name, title_line, field_lines.join("\n"))
626}
627
628/// Extract YAML front matter delimited by `---` lines.
629/// Returns the parsed front matter, the line number where the body starts,
630/// and the body text.
631fn extract_front_matter(
632    input: &str,
633    errors: &mut Vec<ParseError>,
634) -> (Option<FrontMatter>, usize, String) {
635    let lines: Vec<&str> = input.lines().collect();
636
637    // First non-empty line must be `---`
638    let first_delim = lines.iter().position(|l| l.trim() == "---");
639    if first_delim != Some(0) {
640        errors.push(ParseError {
641            line: 1,
642            message: "missing YAML front matter (expected `---` on first line)".into(),
643        });
644        return (None, 1, input.to_string());
645    }
646
647    // Find closing `---`
648    let close_delim = lines[1..].iter().position(|l| l.trim() == "---");
649    let Some(close_offset) = close_delim else {
650        errors.push(ParseError {
651            line: 1,
652            message: "unclosed YAML front matter (missing closing `---`)".into(),
653        });
654        return (None, 1, String::new());
655    };
656
657    let close_line = close_offset + 1; // index in `lines`
658    let yaml_str: String = lines[1..close_line].join("\n");
659    let body_start_line = close_line + 2; // 1-indexed line number after closing `---`
660    let body = lines[close_line + 1..].join("\n");
661
662    match serde_yaml::from_str::<FrontMatter>(&yaml_str) {
663        Ok(fm) => (Some(fm), body_start_line, body),
664        Err(e) => {
665            errors.push(ParseError {
666                line: 2,
667                message: format!("invalid YAML front matter: {e}"),
668            });
669            (None, body_start_line, body)
670        }
671    }
672}
673
674fn validate_front_matter(fm: &FrontMatter, errors: &mut Vec<ParseError>) {
675    // Validate case ID (NULID) if present
676    if let Some(id) = &fm.id
677        && id.len() != MAX_CASE_ID_LEN {
678            errors.push(ParseError {
679                line: 2,
680                message: format!(
681                    "front matter `id` must be a {MAX_CASE_ID_LEN}-char NULID, got {} chars",
682                    id.len()
683                ),
684            });
685        }
686
687    // Validate sources count
688    if fm.sources.len() > MAX_SOURCES {
689        errors.push(ParseError {
690            line: 2,
691            message: format!(
692                "front matter `sources` exceeds {MAX_SOURCES} entries (got {})",
693                fm.sources.len()
694            ),
695        });
696    }
697
698    // Validate each source URL is HTTPS
699    for (i, source) in fm.sources.iter().enumerate() {
700        if !source.url().starts_with("https://") {
701            errors.push(ParseError {
702                line: 2,
703                message: format!("source[{i}] must be HTTPS, got {:?}", source.url()),
704            });
705        }
706    }
707
708    // Validate case_type
709    if let Some(ct) = &fm.case_type {
710        use crate::domain::CaseType;
711        let normalized = ct.to_lowercase().replace(' ', "_");
712        if !CaseType::KNOWN.contains(&normalized.as_str())
713            && crate::domain::parse_custom(ct).is_none()
714        {
715            errors.push(ParseError {
716                line: 2,
717                message: format!(
718                    "invalid case_type {:?} (known: {}; use \"custom:Value\" for custom)",
719                    ct,
720                    CaseType::KNOWN.join(", ")
721                ),
722            });
723        }
724    }
725
726    // Validate status
727    if let Some(st) = &fm.status {
728        use crate::domain::CaseStatus;
729        let normalized = st.to_lowercase().replace(' ', "_");
730        if !CaseStatus::KNOWN.contains(&normalized.as_str()) {
731            errors.push(ParseError {
732                line: 2,
733                message: format!(
734                    "invalid status {:?} (known: {})",
735                    st,
736                    CaseStatus::KNOWN.join(", ")
737                ),
738            });
739        }
740    }
741
742    // Validate tags
743    if fm.tags.len() > MAX_CASE_TAGS {
744        errors.push(ParseError {
745            line: 2,
746            message: format!(
747                "front matter `tags` exceeds {MAX_CASE_TAGS} entries (got {})",
748                fm.tags.len()
749            ),
750        });
751    }
752    for (i, tag) in fm.tags.iter().enumerate() {
753        if tag.len() > MAX_TAG_LEN {
754            errors.push(ParseError {
755                line: 2,
756                message: format!("tag[{i}] exceeds {MAX_TAG_LEN} chars (got {})", tag.len()),
757            });
758        }
759        if tag.is_empty() {
760            errors.push(ParseError {
761                line: 2,
762                message: format!("tag[{i}] must not be empty"),
763            });
764        }
765    }
766}
767
768/// Extract the H1 title, summary text, and H2 sections from the body.
769#[allow(clippy::too_many_lines)]
770fn extract_body(
771    body: &str,
772    body_start_line: usize,
773    errors: &mut Vec<ParseError>,
774) -> (String, String, Vec<Section>) {
775    let lines: Vec<&str> = body.lines().collect();
776    let mut title = String::new();
777    let mut title_found = false;
778    let mut summary_lines: Vec<&str> = Vec::new();
779    let mut sections: Vec<Section> = Vec::new();
780
781    // Track current H2 section being built
782    let mut current_section_kind: Option<SectionKind> = None;
783    let mut current_section_line: usize = 0;
784    let mut current_section_body: Vec<&str> = Vec::new();
785
786    // State: before H1, after H1 (summary), in sections
787    let mut state = State::BeforeTitle;
788
789    for (i, line) in lines.iter().enumerate() {
790        let file_line = body_start_line + i; // 1-indexed line in original file
791
792        if let Some(heading) = strip_heading(line, 1) {
793            if title_found {
794                errors.push(ParseError {
795                    line: file_line,
796                    message: "multiple H1 headings found (expected exactly one)".into(),
797                });
798                continue;
799            }
800            title = heading.to_string();
801            title_found = true;
802            state = State::Summary;
803            continue;
804        }
805
806        if let Some(heading) = strip_heading(line, 2) {
807            // Flush previous section
808            if let Some(kind) = current_section_kind.take() {
809                sections.push(Section {
810                    kind,
811                    body: current_section_body.join("\n"),
812                    line: current_section_line,
813                });
814                current_section_body.clear();
815            }
816
817            match SectionKind::from_heading(heading) {
818                Some(kind) if kind.is_case_section() => {
819                    // Check for duplicate sections
820                    if sections.iter().any(|s| s.kind == kind) {
821                        errors.push(ParseError {
822                            line: file_line,
823                            message: format!("duplicate section: ## {heading}"),
824                        });
825                    }
826                    current_section_kind = Some(kind);
827                    current_section_line = file_line;
828                    state = State::InSection;
829                }
830                Some(_) => {
831                    // Legacy section (People/Organizations) -- not allowed in case files
832                    errors.push(ParseError {
833                        line: file_line,
834                        message: format!(
835                            "## {heading} is not allowed in case files (use standalone entity files in people/ or organizations/ instead)"
836                        ),
837                    });
838                }
839                None => {
840                    errors.push(ParseError {
841                        line: file_line,
842                        message: format!(
843                            "unknown section: ## {heading} (expected one of: {})",
844                            KNOWN_CASE_SECTIONS.join(", ")
845                        ),
846                    });
847                }
848            }
849            continue;
850        }
851
852        match state {
853            State::BeforeTitle => {
854                // Skip blank lines before title
855                if !line.trim().is_empty() {
856                    errors.push(ParseError {
857                        line: file_line,
858                        message: "expected H1 title (# Title)".into(),
859                    });
860                }
861            }
862            State::Summary => {
863                summary_lines.push(line);
864            }
865            State::InSection => {
866                current_section_body.push(line);
867            }
868        }
869    }
870
871    // Flush last section
872    if let Some(kind) = current_section_kind.take() {
873        sections.push(Section {
874            kind,
875            body: current_section_body.join("\n"),
876            line: current_section_line,
877        });
878    }
879
880    // Validate title
881    if !title_found {
882        errors.push(ParseError {
883            line: body_start_line,
884            message: "missing H1 title".into(),
885        });
886    } else if title.len() > MAX_TITLE_LEN {
887        errors.push(ParseError {
888            line: body_start_line,
889            message: format!(
890                "H1 title exceeds {MAX_TITLE_LEN} chars (got {})",
891                title.len()
892            ),
893        });
894    }
895
896    // Build summary (trim leading/trailing blank lines)
897    let summary = summary_lines.clone().join("\n").trim().to_string();
898
899    if summary.len() > MAX_SUMMARY_LEN {
900        errors.push(ParseError {
901            line: body_start_line,
902            message: format!(
903                "summary exceeds {MAX_SUMMARY_LEN} chars (got {})",
904                summary.len()
905            ),
906        });
907    }
908
909    (title, summary, sections)
910}
911
912#[derive(Clone, Copy)]
913enum State {
914    BeforeTitle,
915    Summary,
916    InSection,
917}
918
/// Strip an ATX heading prefix of exactly the given level and return the
/// trimmed heading text.
///
/// Leading whitespace before the `#` run is ignored. The run must consist of
/// exactly `level` `#` characters and be followed by a space or the end of
/// the line, so `### Foo` matches neither level 1 nor level 2, and `##Foo`
/// is not a heading. A bare `##` yields an empty heading.
///
/// Heading text may itself start with `#`: `strip_heading("# #1 Case", 1)`
/// returns `Some("#1 Case")`.
///
/// E.g., `strip_heading("## Foo", 2)` returns `Some("Foo")`.
fn strip_heading(line: &str, level: usize) -> Option<&str> {
    let trimmed = line.trim_start();

    // Counting the whole `#` run is what tells `##` apart from `###`.
    let hashes = trimmed.bytes().take_while(|&b| b == b'#').count();
    if hashes != level {
        return None;
    }

    // `#` is ASCII, so `hashes` is always a valid char boundary.
    let after = &trimmed[hashes..];
    if after.is_empty() {
        return Some("");
    }
    after.strip_prefix(' ').map(str::trim)
}
941
#[cfg(test)]
mod tests {
    use super::*;

    /// Unwrap a parse result, panicking with every error joined by `"; "` when
    /// it is an `Err`.
    ///
    /// `#[track_caller]` makes the panic location point at the calling test
    /// instead of at this helper.
    #[track_caller]
    fn expect_ok<T, C>(result: Result<T, C>) -> T
    where
        C: IntoIterator,
        C::Item: ToString,
    {
        match result {
            Ok(parsed) => parsed,
            Err(errs) => panic!(
                "parse failed: {}",
                errs.into_iter()
                    .map(|err| err.to_string())
                    .collect::<Vec<_>>()
                    .join("; ")
            ),
        }
    }

    /// Smallest case file used as a happy-path fixture: front matter with an
    /// id and one source, a title, a summary, and two sections.
    fn minimal_case() -> String {
        [
            "---",
            "id: 01H9XT7H1J3929RK32FWSRKV88",
            "sources:",
            "  - https://example.com/source",
            "---",
            "",
            "# Test Case Title",
            "",
            "This is the summary.",
            "",
            "## Events",
            "",
            "### Something happened",
            "- occurred_at: 2025-01-01",
            "",
            "## Relationships",
            "",
            "- Something happened -> Something happened: associate_of",
        ]
        .join("\n")
    }

    /// The minimal fixture parses and exposes id, source, title, summary and
    /// both sections in order.
    #[test]
    fn parse_minimal_case() {
        let case = expect_ok(parse(&minimal_case()));

        assert_eq!(case.id.as_deref(), Some("01H9XT7H1J3929RK32FWSRKV88"));
        assert_eq!(case.sources.len(), 1);
        assert_eq!(case.sources[0].url(), "https://example.com/source");
        assert_eq!(case.title, "Test Case Title");
        assert_eq!(case.summary, "This is the summary.");
        assert_eq!(case.sections.len(), 2);
        assert_eq!(case.sections[0].kind, SectionKind::Events);
        assert_eq!(case.sections[1].kind, SectionKind::Relationships);
    }

    /// Input without a front matter block is reported as such.
    #[test]
    fn parse_missing_front_matter() {
        let input = "# Title\n\nSummary.\n";
        let errs = parse(input).unwrap_err();
        assert!(errs.iter().any(|e| e.message.contains("front matter")));
    }

    /// A front matter block that is never closed is reported as unclosed.
    #[test]
    fn parse_unclosed_front_matter() {
        let input = "---\nsources: []\n# Title\n";
        let errs = parse(input).unwrap_err();
        assert!(errs.iter().any(|e| e.message.contains("unclosed")));
    }

    /// An id of the wrong length produces an error mentioning NULID.
    #[test]
    fn parse_invalid_case_id_wrong_length() {
        let input = "---\nid: short\nsources: []\n---\n\n# Title\n";
        let errs = parse(input).unwrap_err();
        assert!(errs.iter().any(|e| e.message.contains("NULID")));
    }

    /// Omitting the id is accepted and yields `None`.
    #[test]
    fn parse_case_id_absent_is_ok() {
        let input = "---\nsources:\n  - https://example.com\n---\n\n# Title\n\nSummary.\n";
        let case = parse(input).unwrap();
        assert!(case.id.is_none());
    }

    /// An `http://` source produces an error mentioning HTTPS.
    #[test]
    fn parse_non_https_source() {
        let input = "---\nsources:\n  - http://example.com\n---\n\n# Title\n";
        let errs = parse(input).unwrap_err();
        assert!(errs.iter().any(|e| e.message.contains("HTTPS")));
    }

    /// Twenty-one sources exceed the limit of 20.
    #[test]
    fn parse_too_many_sources() {
        let sources: Vec<String> = (0..21)
            .map(|i| format!("  - https://example.com/{i}"))
            .collect();
        let input = format!("---\nsources:\n{}\n---\n\n# Title\n", sources.join("\n"));
        let errs = parse(&input).unwrap_err();
        assert!(errs.iter().any(|e| e.message.contains("exceeds 20")));
    }

    /// An H2 with an unrecognised name is reported as an unknown section.
    #[test]
    fn parse_unknown_section() {
        let input = [
            "---",
            "sources: []",
            "---",
            "",
            "# Title",
            "",
            "## Unknown Section",
            "",
        ]
        .join("\n");
        let errs = parse(&input).unwrap_err();
        assert!(errs.iter().any(|e| e.message.contains("unknown section")));
    }

    /// Repeating the same H2 section is reported as a duplicate.
    #[test]
    fn parse_duplicate_section() {
        let input = [
            "---",
            "sources: []",
            "---",
            "",
            "# Title",
            "",
            "## Events",
            "",
            "## Events",
            "",
        ]
        .join("\n");
        let errs = parse(&input).unwrap_err();
        assert!(errs.iter().any(|e| e.message.contains("duplicate")));
    }

    /// A second H1 is reported as multiple H1 headings.
    #[test]
    fn parse_multiple_h1() {
        let input = [
            "---",
            "sources: []",
            "---",
            "",
            "# First Title",
            "",
            "# Second Title",
            "",
        ]
        .join("\n");
        let errs = parse(&input).unwrap_err();
        assert!(errs.iter().any(|e| e.message.contains("multiple H1")));
    }

    /// Events, Relationships and Timeline sections are returned in file order.
    #[test]
    fn parse_all_sections() {
        let input = [
            "---",
            "id: 01H9XT7H1KRQ9SJ7SD9ETB5CVQ",
            "sources:",
            "  - https://example.com/a",
            "---",
            "",
            "# Full Case",
            "",
            "Summary text here.",
            "",
            "## Events",
            "",
            "### Something happened",
            "- occurred_at: 2025-01-01",
            "",
            "## Relationships",
            "",
            "- Alice -> Corp Inc: employed_by",
            "",
            "## Timeline",
            "",
            "Something happened",
        ]
        .join("\n");

        let case = expect_ok(parse(&input));

        assert_eq!(case.id.as_deref(), Some("01H9XT7H1KRQ9SJ7SD9ETB5CVQ"));
        assert_eq!(case.title, "Full Case");
        assert_eq!(case.summary, "Summary text here.");
        assert_eq!(case.sections.len(), 3);
        assert_eq!(case.sections[0].kind, SectionKind::Events);
        assert_eq!(case.sections[1].kind, SectionKind::Relationships);
        assert_eq!(case.sections[2].kind, SectionKind::Timeline);
    }

    /// A title followed directly by a section yields an empty summary.
    #[test]
    fn parse_empty_summary() {
        let input = [
            "---",
            "sources: []",
            "---",
            "",
            "# Title",
            "",
            "## Events",
            "",
        ]
        .join("\n");

        let case = expect_ok(parse(&input));
        assert_eq!(case.summary, "");
    }

    /// Consecutive summary lines are joined with a newline.
    #[test]
    fn parse_multiline_summary() {
        let input = [
            "---",
            "sources: []",
            "---",
            "",
            "# Title",
            "",
            "First line of summary.",
            "Second line of summary.",
            "",
            "## Events",
            "",
        ]
        .join("\n");

        let case = expect_ok(parse(&input));
        assert_eq!(
            case.summary,
            "First line of summary.\nSecond line of summary."
        );
    }

    /// `strip_heading` matches only the exact heading level requested.
    #[test]
    fn strip_heading_levels() {
        assert_eq!(strip_heading("# Title", 1), Some("Title"));
        assert_eq!(strip_heading("## Section", 2), Some("Section"));
        assert_eq!(strip_heading("### Entity", 3), Some("Entity"));
        // H3 should not match H2
        assert_eq!(strip_heading("### Entity", 2), None);
        // H2 should not match H1
        assert_eq!(strip_heading("## Section", 1), None);
        // Not a heading
        assert_eq!(strip_heading("Normal text", 1), None);
    }

    /// The raw section body keeps H3 headings and bullet lines.
    #[test]
    fn section_body_content() {
        let input = [
            "---",
            "sources: []",
            "---",
            "",
            "# Title",
            "",
            "## Events",
            "",
            "### Bonnick dismissal",
            "- occurred_at: 2024-12-24",
            "- type: termination",
            "",
        ]
        .join("\n");

        let case = expect_ok(parse(&input));

        assert_eq!(case.sections.len(), 1);
        let body = &case.sections[0].body;
        assert!(body.contains("### Bonnick dismissal"));
        assert!(body.contains("- occurred_at: 2024-12-24"));
    }

    /// A `## People` section is rejected in case files.
    #[test]
    fn parse_rejects_people_section_in_case_file() {
        let input = [
            "---",
            "sources: []",
            "---",
            "",
            "# Title",
            "",
            "## People",
            "",
        ]
        .join("\n");
        let errs = parse(&input).unwrap_err();
        assert!(
            errs.iter()
                .any(|e| e.message.contains("not allowed in case files"))
        );
    }

    /// A `## Organizations` section is rejected in case files.
    #[test]
    fn parse_rejects_organizations_section_in_case_file() {
        let input = [
            "---",
            "sources: []",
            "---",
            "",
            "# Title",
            "",
            "## Organizations",
            "",
        ]
        .join("\n");
        let errs = parse(&input).unwrap_err();
        assert!(
            errs.iter()
                .any(|e| e.message.contains("not allowed in case files"))
        );
    }

    /// An entity file with an id exposes the id, the H1 name and the body.
    #[test]
    fn parse_entity_file_with_id() {
        let input = [
            "---",
            "id: 01JXYZ123456789ABCDEFGHIJK",
            "---",
            "",
            "# Mark Bonnick",
            "",
            "- qualifier: Arsenal Kit Manager",
            "- nationality: British",
            "",
        ]
        .join("\n");

        let result = parse_entity_file(&input).unwrap();
        assert_eq!(result.id.as_deref(), Some("01JXYZ123456789ABCDEFGHIJK"));
        assert_eq!(result.name, "Mark Bonnick");
        assert!(result.body.contains("- qualifier: Arsenal Kit Manager"));
        assert!(result.body.contains("- nationality: British"));
    }

    /// An entity file with empty front matter has no id.
    #[test]
    fn parse_entity_file_without_id() {
        let input = [
            "---",
            "---",
            "",
            "# Arsenal FC",
            "",
            "- qualifier: English Football Club",
            "- org_type: sports_club",
            "",
        ]
        .join("\n");

        let result = parse_entity_file(&input).unwrap();
        assert!(result.id.is_none());
        assert_eq!(result.name, "Arsenal FC");
    }

    /// An entity file may omit the front matter block entirely.
    #[test]
    fn parse_entity_file_no_front_matter() {
        let input = ["# Bob Smith", "", "- nationality: Dutch", ""].join("\n");

        let result = parse_entity_file(&input).unwrap();
        assert!(result.id.is_none());
        assert_eq!(result.name, "Bob Smith");
        assert!(result.body.contains("- nationality: Dutch"));
    }

    /// H2 sections are rejected in entity files.
    #[test]
    fn parse_entity_file_rejects_h2_sections() {
        let input = [
            "---",
            "---",
            "",
            "# Test Entity",
            "",
            "## Relationships",
            "",
        ]
        .join("\n");

        let errs = parse_entity_file(&input).unwrap_err();
        assert!(errs.iter().any(|e| e.message.contains("H2 sections")));
    }

    /// An entity file without an H1 is reported as missing it.
    #[test]
    fn parse_entity_file_missing_h1() {
        let input = ["---", "---", "", "- nationality: Dutch", ""].join("\n");

        let errs = parse_entity_file(&input).unwrap_err();
        assert!(errs.iter().any(|e| e.message.contains("missing H1")));
    }

    /// `## Related Cases` entries are parsed into `related_cases` and removed
    /// from the generic section list.
    #[test]
    fn parse_related_cases_section() {
        let input = [
            "---",
            "tags: [bribery]",
            "sources:",
            "  - https://example.com",
            "---",
            "",
            "# Test Case",
            "",
            "Summary text.",
            "",
            "## Related Cases",
            "",
            "- id/corruption/2002/blbi-liquidity-aid-scandal",
            "  description: Artalyta bribed Urip to influence the BLBI investigation",
            "- id/corruption/2008/another-case",
            "  description: A second related case",
        ]
        .join("\n");

        let case = expect_ok(parse(&input));

        assert_eq!(case.related_cases.len(), 2);
        assert_eq!(
            case.related_cases[0].case_path,
            "id/corruption/2002/blbi-liquidity-aid-scandal"
        );
        assert_eq!(
            case.related_cases[0].description,
            "Artalyta bribed Urip to influence the BLBI investigation"
        );
        assert_eq!(
            case.related_cases[1].case_path,
            "id/corruption/2008/another-case"
        );
        assert_eq!(case.related_cases[1].description, "A second related case");
        // RelatedCases should be consumed and NOT appear in sections
        assert!(
            !case
                .sections
                .iter()
                .any(|s| s.kind == SectionKind::RelatedCases)
        );
    }

    /// A related-case bullet with no path is rejected.
    #[test]
    fn parse_related_cases_empty_path() {
        let input = [
            "---",
            "sources: []",
            "---",
            "",
            "# Title",
            "",
            "## Related Cases",
            "",
            "- ",
            "  description: Some description",
        ]
        .join("\n");

        let errs = parse(&input).unwrap_err();
        assert!(
            errs.iter()
                .any(|e| e.message.contains("case path must not be empty"))
        );
    }

    /// A related case without a description line is rejected.
    #[test]
    fn parse_related_cases_missing_description() {
        let input = [
            "---",
            "sources: []",
            "---",
            "",
            "# Title",
            "",
            "## Related Cases",
            "",
            "- id/corruption/2002/some-case",
        ]
        .join("\n");

        let errs = parse(&input).unwrap_err();
        assert!(errs.iter().any(|e| e.message.contains("description")));
    }

    /// A 501-character description exceeds the limit of 500.
    #[test]
    fn parse_related_cases_description_too_long() {
        let long_desc = "x".repeat(501);
        let input = [
            "---",
            "sources: []",
            "---",
            "",
            "# Title",
            "",
            "## Related Cases",
            "",
            "- id/corruption/2002/some-case",
            &format!("  description: {long_desc}"),
        ]
        .join("\n");

        let errs = parse(&input).unwrap_err();
        assert!(errs.iter().any(|e| e.message.contains("exceeds 500")));
    }

    /// Eleven related cases exceed the limit of 10.
    #[test]
    fn parse_related_cases_too_many() {
        let mut lines = vec![
            "---".to_string(),
            "sources: []".to_string(),
            "---".to_string(),
            String::new(),
            "# Title".to_string(),
            String::new(),
            "## Related Cases".to_string(),
            String::new(),
        ];
        for i in 0..11 {
            lines.push(format!("- id/corruption/2002/case-{i}"));
            lines.push(format!("  description: Description {i}"));
        }
        let input = lines.join("\n");

        let errs = parse(&input).unwrap_err();
        assert!(errs.iter().any(|e| e.message.contains("exceeds 10")));
    }
}
weave_content/parser.rs

weave_content/
parser.rs