weave_content/
parser.rs

1#![allow(clippy::module_name_repetitions)]
2
3use std::fmt;
4
5use serde::{Deserialize, Serialize};
6
/// Maximum length of a case NULID (26 chars Crockford Base32).
///
/// The front matter `id` is compared against this with `str::len()`, so the
/// value is a byte count.
const MAX_CASE_ID_LEN: usize = 26;

/// Maximum number of sources in front matter.
const MAX_SOURCES: usize = 20;

/// Maximum length of the case title (H1).
///
/// Checked with `str::len()`, i.e. measured in bytes. The same limit is
/// applied to the H1 name of standalone entity files.
const MAX_TITLE_LEN: usize = 200;

/// Maximum length of the case summary, measured in bytes (`str::len()`).
const MAX_SUMMARY_LEN: usize = 2000;
18
/// Known H2 section names for case files (case-insensitive match).
/// People and Organizations are no longer allowed in case files -- they
/// live in standalone entity files under `people/` and `organizations/`.
///
/// The list is shown to the user in the "unknown section" error message, so
/// it names every heading that `SectionKind::is_case_section` accepts,
/// including `Involved`.
const KNOWN_CASE_SECTIONS: &[&str] = &[
    "Events",
    "Documents",
    "Assets",
    "Relationships",
    "Timeline",
    "Related Cases",
    "Involved",
];
30
/// A parsed case file with front matter, title, summary, and raw sections.
#[derive(Debug)]
pub struct ParsedCase {
    /// NULID for the case node (None if not yet generated).
    pub id: Option<String>,
    /// Source entries from the front matter `sources` list.
    pub sources: Vec<SourceEntry>,
    /// Text of the H1 heading.
    pub title: String,
    /// Text that follows the H1 heading before the first H2 section, with
    /// surrounding whitespace trimmed.
    pub summary: String,
    /// Raw H2 sections in file order. `## Related Cases` and `## Involved`
    /// sections are not kept here; their parsed entries are stored in
    /// `related_cases` and `involved` instead.
    pub sections: Vec<Section>,
    /// Case type from front matter (e.g. `corruption`, `fraud`).
    pub case_type: Option<String>,
    /// Case status from front matter (e.g. `open`, `trial`).
    pub status: Option<String>,
    /// Structured amounts DSL string (e.g. `660000 USD bribe | 250000000 IDR fine`).
    pub amounts: Option<String>,
    /// Tags from front matter for categorization.
    pub tags: Vec<String>,
    /// Related case entries from `## Related Cases` section.
    pub related_cases: Vec<RelatedCase>,
    /// Involved entity entries from `## Involved` section.
    pub involved: Vec<InvolvedEntry>,
}
53
/// A related case entry from `## Related Cases` section.
///
/// When serialized, `id` is omitted if it is `None` and `line` is never
/// written.
#[derive(Debug, Clone, PartialEq, Eq, Serialize)]
pub struct RelatedCase {
    /// Case path relative to content root (e.g. `id/corruption/2002/blbi-liquidity-aid-scandal`).
    pub case_path: String,
    /// Description of the relationship between the cases.
    pub description: String,
    /// NULID for the `related_to` relationship (auto-generated on first build).
    #[serde(skip_serializing_if = "Option::is_none")]
    pub id: Option<String>,
    /// Line number (1-indexed) where this entry appears in the original file.
    #[serde(skip)]
    pub line: usize,
}
68
/// An entity reference in the `## Involved` section.
///
/// Produced by `parse_involved` from a `- Entity Name` bullet and an
/// optional `id:` line directly below it.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct InvolvedEntry {
    /// Entity name (must match a registry entity referenced in the case).
    pub entity_name: String,
    /// NULID for the `involved_in` relationship (auto-generated on first build).
    pub id: Option<String>,
    /// Line number (1-indexed) where this entry appears in the original file.
    pub line: usize,
}
79
/// A raw H2 section: the kind recognised from its heading and its unparsed
/// body content.
#[derive(Debug)]
pub struct Section {
    /// Section kind derived from the H2 heading text.
    pub kind: SectionKind,
    /// Lines collected under the heading, joined with `\n`.
    pub body: String,
    /// Line number (1-indexed) where the H2 heading appears in the original file.
    pub line: usize,
}
88
/// The kind of an H2 section, derived from its heading text.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum SectionKind {
    People,
    Organizations,
    Events,
    Documents,
    Assets,
    Relationships,
    Timeline,
    RelatedCases,
    Involved,
}

impl SectionKind {
    /// Map H2 heading text to a section kind.
    ///
    /// Surrounding whitespace is ignored and the comparison is ASCII
    /// case-insensitive. Returns `None` for headings that name no known kind.
    fn from_heading(heading: &str) -> Option<Self> {
        const HEADINGS: [(&str, SectionKind); 9] = [
            ("People", SectionKind::People),
            ("Organizations", SectionKind::Organizations),
            ("Events", SectionKind::Events),
            ("Documents", SectionKind::Documents),
            ("Assets", SectionKind::Assets),
            ("Relationships", SectionKind::Relationships),
            ("Timeline", SectionKind::Timeline),
            ("Related Cases", SectionKind::RelatedCases),
            ("Involved", SectionKind::Involved),
        ];

        let wanted = heading.trim();
        HEADINGS
            .iter()
            .find(|(label, _)| wanted.eq_ignore_ascii_case(label))
            .map(|&(_, kind)| kind)
    }

    /// Whether this section kind is valid in case files.
    /// People and Organizations are no longer allowed in case files.
    pub fn is_case_section(self) -> bool {
        match self {
            Self::People | Self::Organizations => false,
            Self::Events
            | Self::Documents
            | Self::Assets
            | Self::Relationships
            | Self::Timeline
            | Self::RelatedCases
            | Self::Involved => true,
        }
    }
}
134
/// A parser error tied to a location in the input file.
#[derive(Debug)]
pub struct ParseError {
    /// Line number the error is reported against.
    pub line: usize,
    /// Human-readable description of the problem.
    pub message: String,
}

impl fmt::Display for ParseError {
    /// Renders the error as `line <n>: <message>`.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let Self { line, message } = self;
        f.write_fmt(format_args!("line {}: {}", line, message))
    }
}
147
/// Maximum number of tags per case file.
const MAX_CASE_TAGS: usize = 10;

/// Maximum number of tags per entity file.
const MAX_ENTITY_TAGS: usize = 5;

/// Maximum length of a single tag, in bytes (compared against `str::len()`).
/// Applied to both case-file and entity-file tags.
const MAX_TAG_LEN: usize = 50;

/// Maximum number of related case entries per case file.
const MAX_RELATED_CASES: usize = 10;

/// Maximum length of a related case description, in bytes (compared
/// against `str::len()`).
const MAX_RELATED_DESCRIPTION_LEN: usize = 500;
162
163/// Parse the body of a `## Related Cases` section into `RelatedCase` entries.
164///
165/// Each entry is a bullet `- <case_path>` followed by indented fields:
166/// `description: <text>` (required) and `id: <NULID>` (optional, written back).
167pub fn parse_related_cases(
168    body: &str,
169    section_start_line: usize,
170    errors: &mut Vec<ParseError>,
171) -> Vec<RelatedCase> {
172    let mut entries: Vec<(String, String, Option<String>, usize)> = Vec::new(); // (path, desc, id, line)
173
174    for (offset, line) in body.lines().enumerate() {
175        let file_line = section_start_line + offset + 1;
176
177        if let Some(rest) = line.strip_prefix("- ") {
178            let case_path = rest.trim().to_string();
179            entries.push((case_path, String::new(), None, file_line));
180        } else if let Some(rest) = line.strip_prefix("  description: ") {
181            if let Some(entry) = entries.last_mut() {
182                entry.1 = rest.trim().to_string();
183            } else {
184                errors.push(ParseError {
185                    line: file_line,
186                    message: "description without a preceding case path".into(),
187                });
188            }
189        } else if let Some(rest) = line.strip_prefix("  id: ") {
190            if let Some(entry) = entries.last_mut() {
191                entry.2 = Some(rest.trim().to_string());
192            } else {
193                errors.push(ParseError {
194                    line: file_line,
195                    message: "id without a preceding case path".into(),
196                });
197            }
198        } else if !line.trim().is_empty() {
199            errors.push(ParseError {
200                line: file_line,
201                message: format!("unexpected line in Related Cases: {line}"),
202            });
203        }
204    }
205
206    if entries.len() > MAX_RELATED_CASES {
207        errors.push(ParseError {
208            line: section_start_line,
209            message: format!(
210                "Related Cases exceeds {MAX_RELATED_CASES} entries (got {})",
211                entries.len()
212            ),
213        });
214    }
215
216    let mut result = Vec::new();
217    for (case_path, description, id, line) in entries {
218        if case_path.is_empty() {
219            errors.push(ParseError {
220                line,
221                message: "related case path must not be empty".into(),
222            });
223            continue;
224        }
225        if description.is_empty() {
226            errors.push(ParseError {
227                line,
228                message: format!("related case {case_path:?} missing description"),
229            });
230            continue;
231        }
232        if description.len() > MAX_RELATED_DESCRIPTION_LEN {
233            errors.push(ParseError {
234                line,
235                message: format!(
236                    "related case description exceeds {MAX_RELATED_DESCRIPTION_LEN} chars (got {})",
237                    description.len()
238                ),
239            });
240            continue;
241        }
242        result.push(RelatedCase {
243            case_path,
244            description,
245            id,
246            line,
247        });
248    }
249
250    result
251}
252
253/// Maximum number of entries in `## Involved` section.
254const MAX_INVOLVED: usize = 50;
255
256/// Parse the body of a `## Involved` section into `InvolvedEntry` items.
257///
258/// Format:
259/// ```text
260/// - Entity Name
261///   id: 01ABC...
262/// ```
263pub fn parse_involved(
264    body: &str,
265    section_start_line: usize,
266    errors: &mut Vec<ParseError>,
267) -> Vec<InvolvedEntry> {
268    let mut entries = Vec::new();
269    let lines: Vec<&str> = body.lines().collect();
270
271    let mut i = 0;
272    while i < lines.len() {
273        let file_line = section_start_line + 1 + i;
274        let trimmed = lines[i].trim();
275
276        if trimmed.is_empty() {
277            i += 1;
278            continue;
279        }
280
281        let Some(name) = trimmed.strip_prefix("- ") else {
282            errors.push(ParseError {
283                line: file_line,
284                message: format!("expected involved entry `- Entity Name`, got {trimmed:?}"),
285            });
286            i += 1;
287            continue;
288        };
289
290        let entity_name = name.trim().to_string();
291        if entity_name.is_empty() {
292            errors.push(ParseError {
293                line: file_line,
294                message: "involved entity name must not be empty".into(),
295            });
296            i += 1;
297            continue;
298        }
299
300        // Look ahead for `id:` on the next line
301        let mut id: Option<String> = None;
302        if i + 1 < lines.len() {
303            let next = lines[i + 1].trim();
304            if let Some(id_val) = next.strip_prefix("id: ") {
305                id = Some(id_val.trim().to_string());
306                i += 1;
307            }
308        }
309
310        entries.push(InvolvedEntry {
311            entity_name,
312            id,
313            line: file_line,
314        });
315
316        i += 1;
317    }
318
319    if entries.len() > MAX_INVOLVED {
320        errors.push(ParseError {
321            line: section_start_line,
322            message: format!(
323                "Involved exceeds {MAX_INVOLVED} entries (got {})",
324                entries.len()
325            ),
326        });
327    }
328
329    entries
330}
331
/// YAML front matter schema for case files.
///
/// Every field carries `#[serde(default)]`, so a missing key deserializes to
/// `None` or an empty list rather than failing.
#[derive(Deserialize)]
struct FrontMatter {
    /// NULID for the case node (auto-generated on first build).
    #[serde(default)]
    id: Option<String>,
    /// Source entries (bare URLs or structured objects).
    #[serde(default)]
    sources: Vec<SourceEntry>,
    /// Case type string; checked by `validate_front_matter`.
    #[serde(default)]
    case_type: Option<String>,
    /// Case status string; checked by `validate_front_matter`.
    #[serde(default)]
    status: Option<String>,
    /// Amounts DSL string, passed through to `ParsedCase` as-is.
    #[serde(default)]
    amounts: Option<String>,
    /// Tags for categorization.
    #[serde(default)]
    tags: Vec<String>,
}
349
/// A source entry in front matter. Supports both bare URL strings and
/// structured objects with metadata.
///
/// The enum is `#[serde(untagged)]`: a plain YAML string maps to `Url`, a
/// mapping with a `url` key maps to `Structured`.
#[derive(Debug, Clone, PartialEq, Eq, Deserialize, Serialize)]
#[serde(untagged)]
pub enum SourceEntry {
    /// Plain URL string (backward-compatible).
    Url(String),
    /// Structured source with metadata.
    Structured {
        /// Source URL (the only required key).
        url: String,
        /// Optional title of the source.
        #[serde(default)]
        title: Option<String>,
        /// Optional publication date, kept as the raw string.
        #[serde(default)]
        published_at: Option<String>,
        /// Optional language of the source, kept as the raw string.
        #[serde(default)]
        language: Option<String>,
    },
}
368
369impl SourceEntry {
370    /// Get the URL from any source entry variant.
371    pub fn url(&self) -> &str {
372        match self {
373            Self::Url(u) => u,
374            Self::Structured { url, .. } => url,
375        }
376    }
377}
378
/// YAML front matter schema for standalone entity files.
/// Contains an optional `id` field (NULID, generated on first build) and an
/// optional list of `tags`; both default when absent.
#[derive(Deserialize)]
struct EntityFrontMatter {
    /// Stored NULID, if one has been written.
    #[serde(default)]
    id: Option<String>,
    /// Tags for the entity; count and length are checked in `parse_entity_file`.
    #[serde(default)]
    tags: Vec<String>,
}
388
/// A parsed standalone entity file (actor or institution).
#[derive(Debug)]
pub struct ParsedEntityFile {
    /// Stored NULID from front matter (None if not yet generated).
    pub id: Option<String>,
    /// Entity name from H1 heading.
    pub name: String,
    /// Raw bullet field lines (body after H1, no sections), joined with `\n`
    /// and not parsed any further here.
    pub body: String,
    /// Line number of the H1 heading in the original file.
    pub title_line: usize,
    /// Tags from front matter.
    pub tags: Vec<String>,
}
403
404/// Parse a Markdown case file into a `ParsedCase`.
405///
406/// Extracts YAML front matter, H1 title, summary, and H2 sections.
407/// Returns errors for malformed structure or boundary violations.
408pub fn parse(input: &str) -> Result<ParsedCase, Vec<ParseError>> {
409    let mut errors = Vec::new();
410
411    // Extract front matter
412    let (front_matter, body_start_line, body) = extract_front_matter(input, &mut errors);
413
414    let Some(front_matter) = front_matter else {
415        if errors.is_empty() {
416            errors.push(ParseError {
417                line: 1,
418                message: "missing YAML front matter (expected `---` delimiter)".into(),
419            });
420        }
421        return Err(errors);
422    };
423
424    // Validate front matter fields
425    validate_front_matter(&front_matter, &mut errors);
426
427    // Extract title, summary, and sections from body
428    let (title, summary, mut sections) = extract_body(&body, body_start_line, &mut errors);
429
430    // Parse Related Cases sections
431    let mut related_cases = Vec::new();
432    for section in &sections {
433        if section.kind == SectionKind::RelatedCases {
434            let entries = parse_related_cases(&section.body, section.line, &mut errors);
435            related_cases.extend(entries);
436        }
437    }
438    // Remove RelatedCases from sections list (consumed)
439    sections.retain(|s| s.kind != SectionKind::RelatedCases);
440
441    // Parse Involved sections
442    let mut involved = Vec::new();
443    for section in &sections {
444        if section.kind == SectionKind::Involved {
445            let entries = parse_involved(&section.body, section.line, &mut errors);
446            involved.extend(entries);
447        }
448    }
449    // Remove Involved from sections list (consumed)
450    sections.retain(|s| s.kind != SectionKind::Involved);
451
452    if !errors.is_empty() {
453        return Err(errors);
454    }
455
456    Ok(ParsedCase {
457        id: front_matter.id,
458        sources: front_matter.sources,
459        title,
460        summary,
461        sections,
462        case_type: front_matter.case_type,
463        status: front_matter.status,
464        amounts: front_matter.amounts,
465        tags: front_matter.tags,
466        related_cases,
467        involved,
468    })
469}
470
471/// Parse a standalone entity file (actor or institution).
472///
473/// Entity files have YAML front matter with optional `id:`, an H1 name,
474/// and bullet fields directly in the body. No H2 sections are allowed.
475pub fn parse_entity_file(input: &str) -> Result<ParsedEntityFile, Vec<ParseError>> {
476    let mut errors = Vec::new();
477
478    let (front_matter, body_start_line, body) = extract_entity_front_matter(input, &mut errors);
479
480    let id = front_matter.as_ref().and_then(|fm| fm.id.clone());
481    let tags = front_matter.map_or_else(Vec::new, |fm| fm.tags);
482
483    // Validate entity tags
484    if tags.len() > MAX_ENTITY_TAGS {
485        errors.push(ParseError {
486            line: 2,
487            message: format!(
488                "front matter `tags` exceeds {MAX_ENTITY_TAGS} entries (got {})",
489                tags.len()
490            ),
491        });
492    }
493    for (i, tag) in tags.iter().enumerate() {
494        if tag.len() > MAX_TAG_LEN {
495            errors.push(ParseError {
496                line: 2,
497                message: format!("front matter tag #{} exceeds {MAX_TAG_LEN} chars", i + 1),
498            });
499        }
500        if tag.is_empty() {
501            errors.push(ParseError {
502                line: 2,
503                message: format!("front matter tag #{} is empty", i + 1),
504            });
505        }
506    }
507
508    // Extract H1 title and body content (no sections allowed)
509    let (name, title_line, field_body) = extract_entity_body(&body, body_start_line, &mut errors);
510
511    if !errors.is_empty() {
512        return Err(errors);
513    }
514
515    Ok(ParsedEntityFile {
516        id,
517        name,
518        body: field_body,
519        title_line,
520        tags,
521    })
522}
523
524/// Extract YAML front matter for entity files.
525/// Front matter is optional for entity files -- if absent, returns None with no error.
526fn extract_entity_front_matter(
527    input: &str,
528    errors: &mut Vec<ParseError>,
529) -> (Option<EntityFrontMatter>, usize, String) {
530    let lines: Vec<&str> = input.lines().collect();
531
532    let first_delim = lines.iter().position(|l| l.trim() == "---");
533    if first_delim != Some(0) {
534        // No front matter -- entire file is body, starting at line 1
535        return (None, 1, input.to_string());
536    }
537
538    let close_delim = lines[1..].iter().position(|l| l.trim() == "---");
539    let Some(close_offset) = close_delim else {
540        errors.push(ParseError {
541            line: 1,
542            message: "unclosed YAML front matter (missing closing `---`)".into(),
543        });
544        return (None, 1, String::new());
545    };
546
547    let close_line = close_offset + 1;
548    let yaml_str: String = lines[1..close_line].join("\n");
549    let body_start_line = close_line + 2; // 1-indexed line number after closing `---`
550    let body = lines[close_line + 1..].join("\n");
551
552    match serde_yaml::from_str::<EntityFrontMatter>(&yaml_str) {
553        Ok(fm) => (Some(fm), body_start_line, body),
554        Err(e) => {
555            errors.push(ParseError {
556                line: 2,
557                message: format!("invalid YAML front matter: {e}"),
558            });
559            (None, body_start_line, body)
560        }
561    }
562}
563
564/// Extract H1 name and field body from an entity file.
565/// Rejects any H2 sections.
566fn extract_entity_body(
567    body: &str,
568    body_start_line: usize,
569    errors: &mut Vec<ParseError>,
570) -> (String, usize, String) {
571    let lines: Vec<&str> = body.lines().collect();
572    let mut name = String::new();
573    let mut title_found = false;
574    let mut title_line = body_start_line;
575    let mut field_lines: Vec<&str> = Vec::new();
576
577    for (i, line) in lines.iter().enumerate() {
578        let file_line = body_start_line + i;
579
580        if let Some(heading) = strip_heading(line, 1) {
581            if title_found {
582                errors.push(ParseError {
583                    line: file_line,
584                    message: "multiple H1 headings found (expected exactly one)".into(),
585                });
586                continue;
587            }
588            name = heading.to_string();
589            title_found = true;
590            title_line = file_line;
591            continue;
592        }
593
594        // Reject H2 sections in entity files
595        if strip_heading(line, 2).is_some() {
596            errors.push(ParseError {
597                line: file_line,
598                message: "H2 sections are not allowed in entity files".into(),
599            });
600            continue;
601        }
602
603        if title_found {
604            field_lines.push(line);
605        } else if !line.trim().is_empty() {
606            errors.push(ParseError {
607                line: file_line,
608                message: "expected H1 heading (# Name)".into(),
609            });
610        }
611    }
612
613    if !title_found {
614        errors.push(ParseError {
615            line: body_start_line,
616            message: "missing H1 heading".into(),
617        });
618    } else if name.len() > MAX_TITLE_LEN {
619        errors.push(ParseError {
620            line: title_line,
621            message: format!("H1 name exceeds {MAX_TITLE_LEN} chars (got {})", name.len()),
622        });
623    }
624
625    (name, title_line, field_lines.join("\n"))
626}
627
628/// Extract YAML front matter delimited by `---` lines.
629/// Returns the parsed front matter, the line number where the body starts,
630/// and the body text.
631fn extract_front_matter(
632    input: &str,
633    errors: &mut Vec<ParseError>,
634) -> (Option<FrontMatter>, usize, String) {
635    let lines: Vec<&str> = input.lines().collect();
636
637    // First non-empty line must be `---`
638    let first_delim = lines.iter().position(|l| l.trim() == "---");
639    if first_delim != Some(0) {
640        errors.push(ParseError {
641            line: 1,
642            message: "missing YAML front matter (expected `---` on first line)".into(),
643        });
644        return (None, 1, input.to_string());
645    }
646
647    // Find closing `---`
648    let close_delim = lines[1..].iter().position(|l| l.trim() == "---");
649    let Some(close_offset) = close_delim else {
650        errors.push(ParseError {
651            line: 1,
652            message: "unclosed YAML front matter (missing closing `---`)".into(),
653        });
654        return (None, 1, String::new());
655    };
656
657    let close_line = close_offset + 1; // index in `lines`
658    let yaml_str: String = lines[1..close_line].join("\n");
659    let body_start_line = close_line + 2; // 1-indexed line number after closing `---`
660    let body = lines[close_line + 1..].join("\n");
661
662    match serde_yaml::from_str::<FrontMatter>(&yaml_str) {
663        Ok(fm) => (Some(fm), body_start_line, body),
664        Err(e) => {
665            errors.push(ParseError {
666                line: 2,
667                message: format!("invalid YAML front matter: {e}"),
668            });
669            (None, body_start_line, body)
670        }
671    }
672}
673
674fn validate_front_matter(fm: &FrontMatter, errors: &mut Vec<ParseError>) {
675    // Validate case ID (NULID) if present
676    if let Some(id) = &fm.id
677        && id.len() != MAX_CASE_ID_LEN
678    {
679        errors.push(ParseError {
680            line: 2,
681            message: format!(
682                "front matter `id` must be a {MAX_CASE_ID_LEN}-char NULID, got {} chars",
683                id.len()
684            ),
685        });
686    }
687
688    // Validate sources count
689    if fm.sources.len() > MAX_SOURCES {
690        errors.push(ParseError {
691            line: 2,
692            message: format!(
693                "front matter `sources` exceeds {MAX_SOURCES} entries (got {})",
694                fm.sources.len()
695            ),
696        });
697    }
698
699    // Validate each source URL is HTTPS
700    for (i, source) in fm.sources.iter().enumerate() {
701        if !source.url().starts_with("https://") {
702            errors.push(ParseError {
703                line: 2,
704                message: format!("source[{i}] must be HTTPS, got {:?}", source.url()),
705            });
706        }
707    }
708
709    // Validate case_type
710    if let Some(ct) = &fm.case_type {
711        use crate::domain::CaseType;
712        let normalized = ct.to_lowercase().replace(' ', "_");
713        if !CaseType::KNOWN.contains(&normalized.as_str())
714            && crate::domain::parse_custom(ct).is_none()
715        {
716            errors.push(ParseError {
717                line: 2,
718                message: format!(
719                    "invalid case_type {:?} (known: {}; use \"custom:Value\" for custom)",
720                    ct,
721                    CaseType::KNOWN.join(", ")
722                ),
723            });
724        }
725    }
726
727    // Validate status
728    if let Some(st) = &fm.status {
729        use crate::domain::CaseStatus;
730        let normalized = st.to_lowercase().replace(' ', "_");
731        if !CaseStatus::KNOWN.contains(&normalized.as_str()) {
732            errors.push(ParseError {
733                line: 2,
734                message: format!(
735                    "invalid status {:?} (known: {})",
736                    st,
737                    CaseStatus::KNOWN.join(", ")
738                ),
739            });
740        }
741    }
742
743    // Validate tags
744    if fm.tags.len() > MAX_CASE_TAGS {
745        errors.push(ParseError {
746            line: 2,
747            message: format!(
748                "front matter `tags` exceeds {MAX_CASE_TAGS} entries (got {})",
749                fm.tags.len()
750            ),
751        });
752    }
753    for (i, tag) in fm.tags.iter().enumerate() {
754        if tag.len() > MAX_TAG_LEN {
755            errors.push(ParseError {
756                line: 2,
757                message: format!("tag[{i}] exceeds {MAX_TAG_LEN} chars (got {})", tag.len()),
758            });
759        }
760        if tag.is_empty() {
761            errors.push(ParseError {
762                line: 2,
763                message: format!("tag[{i}] must not be empty"),
764            });
765        }
766    }
767}
768
769/// Extract the H1 title, summary text, and H2 sections from the body.
770#[allow(clippy::too_many_lines)]
771fn extract_body(
772    body: &str,
773    body_start_line: usize,
774    errors: &mut Vec<ParseError>,
775) -> (String, String, Vec<Section>) {
776    let lines: Vec<&str> = body.lines().collect();
777    let mut title = String::new();
778    let mut title_found = false;
779    let mut summary_lines: Vec<&str> = Vec::new();
780    let mut sections: Vec<Section> = Vec::new();
781
782    // Track current H2 section being built
783    let mut current_section_kind: Option<SectionKind> = None;
784    let mut current_section_line: usize = 0;
785    let mut current_section_body: Vec<&str> = Vec::new();
786
787    // State: before H1, after H1 (summary), in sections
788    let mut state = State::BeforeTitle;
789
790    for (i, line) in lines.iter().enumerate() {
791        let file_line = body_start_line + i; // 1-indexed line in original file
792
793        if let Some(heading) = strip_heading(line, 1) {
794            if title_found {
795                errors.push(ParseError {
796                    line: file_line,
797                    message: "multiple H1 headings found (expected exactly one)".into(),
798                });
799                continue;
800            }
801            title = heading.to_string();
802            title_found = true;
803            state = State::Summary;
804            continue;
805        }
806
807        if let Some(heading) = strip_heading(line, 2) {
808            // Flush previous section
809            if let Some(kind) = current_section_kind.take() {
810                sections.push(Section {
811                    kind,
812                    body: current_section_body.join("\n"),
813                    line: current_section_line,
814                });
815                current_section_body.clear();
816            }
817
818            match SectionKind::from_heading(heading) {
819                Some(kind) if kind.is_case_section() => {
820                    // Check for duplicate sections
821                    if sections.iter().any(|s| s.kind == kind) {
822                        errors.push(ParseError {
823                            line: file_line,
824                            message: format!("duplicate section: ## {heading}"),
825                        });
826                    }
827                    current_section_kind = Some(kind);
828                    current_section_line = file_line;
829                    state = State::InSection;
830                }
831                Some(_) => {
832                    // Legacy section (People/Organizations) -- not allowed in case files
833                    errors.push(ParseError {
834                        line: file_line,
835                        message: format!(
836                            "## {heading} is not allowed in case files (use standalone entity files in people/ or organizations/ instead)"
837                        ),
838                    });
839                }
840                None => {
841                    errors.push(ParseError {
842                        line: file_line,
843                        message: format!(
844                            "unknown section: ## {heading} (expected one of: {})",
845                            KNOWN_CASE_SECTIONS.join(", ")
846                        ),
847                    });
848                }
849            }
850            continue;
851        }
852
853        match state {
854            State::BeforeTitle => {
855                // Skip blank lines before title
856                if !line.trim().is_empty() {
857                    errors.push(ParseError {
858                        line: file_line,
859                        message: "expected H1 title (# Title)".into(),
860                    });
861                }
862            }
863            State::Summary => {
864                summary_lines.push(line);
865            }
866            State::InSection => {
867                current_section_body.push(line);
868            }
869        }
870    }
871
872    // Flush last section
873    if let Some(kind) = current_section_kind.take() {
874        sections.push(Section {
875            kind,
876            body: current_section_body.join("\n"),
877            line: current_section_line,
878        });
879    }
880
881    // Validate title
882    if !title_found {
883        errors.push(ParseError {
884            line: body_start_line,
885            message: "missing H1 title".into(),
886        });
887    } else if title.len() > MAX_TITLE_LEN {
888        errors.push(ParseError {
889            line: body_start_line,
890            message: format!(
891                "H1 title exceeds {MAX_TITLE_LEN} chars (got {})",
892                title.len()
893            ),
894        });
895    }
896
897    // Build summary (trim leading/trailing blank lines)
898    let summary = summary_lines.clone().join("\n").trim().to_string();
899
900    if summary.len() > MAX_SUMMARY_LEN {
901        errors.push(ParseError {
902            line: body_start_line,
903            message: format!(
904                "summary exceeds {MAX_SUMMARY_LEN} chars (got {})",
905                summary.len()
906            ),
907        });
908    }
909
910    (title, summary, sections)
911}
912
/// Position of the `extract_body` scanner within a case file body.
#[derive(Clone, Copy)]
enum State {
    /// No H1 seen yet; non-blank lines here are reported as errors.
    BeforeTitle,
    /// After the H1; lines are collected into the summary.
    Summary,
    /// Inside an accepted H2 section; lines are collected into its body.
    InSection,
}
919
/// Strip an ATX heading prefix of exactly the given level and return the
/// trimmed heading text.
///
/// E.g., `strip_heading("## Foo", 2)` returns `Some("Foo")`, while
/// `strip_heading("### Foo", 2)` returns `None` because the line is a deeper
/// heading. Leading whitespace on the line is ignored. A line made up of only
/// the `#` markers yields `Some("")`. The markers must otherwise be followed
/// by a space, and the text after that space must not begin with `#`.
fn strip_heading(line: &str, level: usize) -> Option<&str> {
    let markers = "#".repeat(level);
    let rest = line.trim_start().strip_prefix(markers.as_str())?;

    if rest.is_empty() {
        return Some("");
    }

    match rest.strip_prefix(' ') {
        Some(text) if !text.starts_with('#') => Some(text.trim()),
        // Another `#` (deeper heading), a non-space separator, or text that
        // itself starts with `#`: not a heading of this level.
        _ => None,
    }
}
942
#[cfg(test)]
mod tests {
    use super::*;

    /// Unwrap a parse result, or panic with every reported error joined by
    /// `"; "` so a failing test shows the full list of problems.
    ///
    /// Generic over the success and error-collection types so it only
    /// requires that the errors can be iterated and rendered as strings.
    /// `#[track_caller]` makes the panic point at the calling test rather
    /// than at this helper.
    #[track_caller]
    fn expect_ok<T, C>(result: Result<T, C>) -> T
    where
        C: IntoIterator,
        C::Item: ToString,
    {
        match result {
            Ok(value) => value,
            Err(errs) => {
                let joined = errs
                    .into_iter()
                    .map(|e| e.to_string())
                    .collect::<Vec<_>>()
                    .join("; ");
                panic!("parse failed: {joined}")
            }
        }
    }

    /// Smallest case file that exercises front matter, title, summary and
    /// two sections.
    fn minimal_case() -> String {
        [
            "---",
            "id: 01H9XT7H1J3929RK32FWSRKV88",
            "sources:",
            "  - https://example.com/source",
            "---",
            "",
            "# Test Case Title",
            "",
            "This is the summary.",
            "",
            "## Events",
            "",
            "### Something happened",
            "- occurred_at: 2025-01-01",
            "",
            "## Relationships",
            "",
            "- Something happened -> Something happened: associate_of",
        ]
        .join("\n")
    }

    #[test]
    fn parse_minimal_case() {
        let case = expect_ok(parse(&minimal_case()));

        assert_eq!(case.id.as_deref(), Some("01H9XT7H1J3929RK32FWSRKV88"));
        assert_eq!(case.sources.len(), 1);
        assert_eq!(case.sources[0].url(), "https://example.com/source");
        assert_eq!(case.title, "Test Case Title");
        assert_eq!(case.summary, "This is the summary.");
        assert_eq!(case.sections.len(), 2);
        assert_eq!(case.sections[0].kind, SectionKind::Events);
        assert_eq!(case.sections[1].kind, SectionKind::Relationships);
    }

    #[test]
    fn parse_missing_front_matter() {
        let input = "# Title\n\nSummary.\n";
        let errs = parse(input).unwrap_err();
        assert!(errs.iter().any(|e| e.message.contains("front matter")));
    }

    #[test]
    fn parse_unclosed_front_matter() {
        let input = "---\nsources: []\n# Title\n";
        let errs = parse(input).unwrap_err();
        assert!(errs.iter().any(|e| e.message.contains("unclosed")));
    }

    #[test]
    fn parse_invalid_case_id_wrong_length() {
        let input = "---\nid: short\nsources: []\n---\n\n# Title\n";
        let errs = parse(input).unwrap_err();
        assert!(errs.iter().any(|e| e.message.contains("NULID")));
    }

    #[test]
    fn parse_case_id_absent_is_ok() {
        let input = "---\nsources:\n  - https://example.com\n---\n\n# Title\n\nSummary.\n";
        let case = expect_ok(parse(input));
        assert!(case.id.is_none());
    }

    #[test]
    fn parse_non_https_source() {
        let input = "---\nsources:\n  - http://example.com\n---\n\n# Title\n";
        let errs = parse(input).unwrap_err();
        assert!(errs.iter().any(|e| e.message.contains("HTTPS")));
    }

    #[test]
    fn parse_too_many_sources() {
        let sources: Vec<String> = (0..21)
            .map(|i| format!("  - https://example.com/{i}"))
            .collect();
        let input = format!("---\nsources:\n{}\n---\n\n# Title\n", sources.join("\n"));
        let errs = parse(&input).unwrap_err();
        assert!(errs.iter().any(|e| e.message.contains("exceeds 20")));
    }

    #[test]
    fn parse_unknown_section() {
        let input = [
            "---",
            "sources: []",
            "---",
            "",
            "# Title",
            "",
            "## Unknown Section",
            "",
        ]
        .join("\n");
        let errs = parse(&input).unwrap_err();
        assert!(errs.iter().any(|e| e.message.contains("unknown section")));
    }

    #[test]
    fn parse_duplicate_section() {
        let input = [
            "---",
            "sources: []",
            "---",
            "",
            "# Title",
            "",
            "## Events",
            "",
            "## Events",
            "",
        ]
        .join("\n");
        let errs = parse(&input).unwrap_err();
        assert!(errs.iter().any(|e| e.message.contains("duplicate")));
    }

    #[test]
    fn parse_multiple_h1() {
        let input = [
            "---",
            "sources: []",
            "---",
            "",
            "# First Title",
            "",
            "# Second Title",
            "",
        ]
        .join("\n");
        let errs = parse(&input).unwrap_err();
        assert!(errs.iter().any(|e| e.message.contains("multiple H1")));
    }

    #[test]
    fn parse_all_sections() {
        let input = [
            "---",
            "id: 01H9XT7H1KRQ9SJ7SD9ETB5CVQ",
            "sources:",
            "  - https://example.com/a",
            "---",
            "",
            "# Full Case",
            "",
            "Summary text here.",
            "",
            "## Events",
            "",
            "### Something happened",
            "- occurred_at: 2025-01-01",
            "",
            "## Relationships",
            "",
            "- Alice -> Corp Inc: employed_by",
            "",
            "## Timeline",
            "",
            "Something happened",
        ]
        .join("\n");

        let case = expect_ok(parse(&input));

        assert_eq!(case.id.as_deref(), Some("01H9XT7H1KRQ9SJ7SD9ETB5CVQ"));
        assert_eq!(case.title, "Full Case");
        assert_eq!(case.summary, "Summary text here.");
        assert_eq!(case.sections.len(), 3);
        assert_eq!(case.sections[0].kind, SectionKind::Events);
        assert_eq!(case.sections[1].kind, SectionKind::Relationships);
        assert_eq!(case.sections[2].kind, SectionKind::Timeline);
    }

    #[test]
    fn parse_empty_summary() {
        let input = [
            "---",
            "sources: []",
            "---",
            "",
            "# Title",
            "",
            "## Events",
            "",
        ]
        .join("\n");

        let case = expect_ok(parse(&input));
        assert_eq!(case.summary, "");
    }

    #[test]
    fn parse_multiline_summary() {
        let input = [
            "---",
            "sources: []",
            "---",
            "",
            "# Title",
            "",
            "First line of summary.",
            "Second line of summary.",
            "",
            "## Events",
            "",
        ]
        .join("\n");

        let case = expect_ok(parse(&input));
        assert_eq!(
            case.summary,
            "First line of summary.\nSecond line of summary."
        );
    }

    #[test]
    fn strip_heading_levels() {
        assert_eq!(strip_heading("# Title", 1), Some("Title"));
        assert_eq!(strip_heading("## Section", 2), Some("Section"));
        assert_eq!(strip_heading("### Entity", 3), Some("Entity"));
        // H3 should not match H2
        assert_eq!(strip_heading("### Entity", 2), None);
        // H2 should not match H1
        assert_eq!(strip_heading("## Section", 1), None);
        // Not a heading
        assert_eq!(strip_heading("Normal text", 1), None);
    }

    #[test]
    fn strip_heading_edge_cases() {
        // Bare marker is an empty heading
        assert_eq!(strip_heading("##", 2), Some(""));
        // A space must separate the marker from the text
        assert_eq!(strip_heading("##NoSpace", 2), None);
        // Leading indentation and trailing whitespace are ignored
        assert_eq!(strip_heading("  ## Indented  ", 2), Some("Indented"));
    }

    #[test]
    fn section_body_content() {
        let input = [
            "---",
            "sources: []",
            "---",
            "",
            "# Title",
            "",
            "## Events",
            "",
            "### Bonnick dismissal",
            "- occurred_at: 2024-12-24",
            "- type: termination",
            "",
        ]
        .join("\n");

        let case = expect_ok(parse(&input));

        assert_eq!(case.sections.len(), 1);
        let body = &case.sections[0].body;
        assert!(body.contains("### Bonnick dismissal"));
        assert!(body.contains("- occurred_at: 2024-12-24"));
    }

    #[test]
    fn parse_rejects_people_section_in_case_file() {
        let input = [
            "---",
            "sources: []",
            "---",
            "",
            "# Title",
            "",
            "## People",
            "",
        ]
        .join("\n");
        let errs = parse(&input).unwrap_err();
        assert!(
            errs.iter()
                .any(|e| e.message.contains("not allowed in case files"))
        );
    }

    #[test]
    fn parse_rejects_organizations_section_in_case_file() {
        let input = [
            "---",
            "sources: []",
            "---",
            "",
            "# Title",
            "",
            "## Organizations",
            "",
        ]
        .join("\n");
        let errs = parse(&input).unwrap_err();
        assert!(
            errs.iter()
                .any(|e| e.message.contains("not allowed in case files"))
        );
    }

    #[test]
    fn parse_entity_file_with_id() {
        let input = [
            "---",
            "id: 01JXYZ123456789ABCDEFGHIJK",
            "---",
            "",
            "# Mark Bonnick",
            "",
            "- qualifier: Arsenal Kit Manager",
            "- nationality: British",
            "",
        ]
        .join("\n");

        let result = parse_entity_file(&input).unwrap();
        assert_eq!(result.id.as_deref(), Some("01JXYZ123456789ABCDEFGHIJK"));
        assert_eq!(result.name, "Mark Bonnick");
        assert!(result.body.contains("- qualifier: Arsenal Kit Manager"));
        assert!(result.body.contains("- nationality: British"));
    }

    #[test]
    fn parse_entity_file_without_id() {
        let input = [
            "---",
            "---",
            "",
            "# Arsenal FC",
            "",
            "- qualifier: English Football Club",
            "- org_type: sports_club",
            "",
        ]
        .join("\n");

        let result = parse_entity_file(&input).unwrap();
        assert!(result.id.is_none());
        assert_eq!(result.name, "Arsenal FC");
    }

    #[test]
    fn parse_entity_file_no_front_matter() {
        let input = ["# Bob Smith", "", "- nationality: Dutch", ""].join("\n");

        let result = parse_entity_file(&input).unwrap();
        assert!(result.id.is_none());
        assert_eq!(result.name, "Bob Smith");
        assert!(result.body.contains("- nationality: Dutch"));
    }

    #[test]
    fn parse_entity_file_rejects_h2_sections() {
        let input = [
            "---",
            "---",
            "",
            "# Test Entity",
            "",
            "## Relationships",
            "",
        ]
        .join("\n");

        let errs = parse_entity_file(&input).unwrap_err();
        assert!(errs.iter().any(|e| e.message.contains("H2 sections")));
    }

    #[test]
    fn parse_entity_file_missing_h1() {
        let input = ["---", "---", "", "- nationality: Dutch", ""].join("\n");

        let errs = parse_entity_file(&input).unwrap_err();
        assert!(errs.iter().any(|e| e.message.contains("missing H1")));
    }

    #[test]
    fn parse_related_cases_section() {
        let input = [
            "---",
            "tags: [bribery]",
            "sources:",
            "  - https://example.com",
            "---",
            "",
            "# Test Case",
            "",
            "Summary text.",
            "",
            "## Related Cases",
            "",
            "- id/corruption/2002/blbi-liquidity-aid-scandal",
            "  description: Artalyta bribed Urip to influence the BLBI investigation",
            "- id/corruption/2008/another-case",
            "  description: A second related case",
        ]
        .join("\n");

        let case = expect_ok(parse(&input));

        assert_eq!(case.related_cases.len(), 2);
        assert_eq!(
            case.related_cases[0].case_path,
            "id/corruption/2002/blbi-liquidity-aid-scandal"
        );
        assert_eq!(
            case.related_cases[0].description,
            "Artalyta bribed Urip to influence the BLBI investigation"
        );
        assert_eq!(
            case.related_cases[1].case_path,
            "id/corruption/2008/another-case"
        );
        assert_eq!(case.related_cases[1].description, "A second related case");
        // RelatedCases should be consumed and NOT appear in sections
        assert!(
            !case
                .sections
                .iter()
                .any(|s| s.kind == SectionKind::RelatedCases)
        );
    }

    #[test]
    fn parse_related_cases_empty_path() {
        let input = [
            "---",
            "sources: []",
            "---",
            "",
            "# Title",
            "",
            "## Related Cases",
            "",
            "- ",
            "  description: Some description",
        ]
        .join("\n");

        let errs = parse(&input).unwrap_err();
        assert!(
            errs.iter()
                .any(|e| e.message.contains("case path must not be empty"))
        );
    }

    #[test]
    fn parse_related_cases_missing_description() {
        let input = [
            "---",
            "sources: []",
            "---",
            "",
            "# Title",
            "",
            "## Related Cases",
            "",
            "- id/corruption/2002/some-case",
        ]
        .join("\n");

        let errs = parse(&input).unwrap_err();
        assert!(errs.iter().any(|e| e.message.contains("description")));
    }

    #[test]
    fn parse_related_cases_description_too_long() {
        let long_desc = "x".repeat(501);
        let input = [
            "---",
            "sources: []",
            "---",
            "",
            "# Title",
            "",
            "## Related Cases",
            "",
            "- id/corruption/2002/some-case",
            &format!("  description: {long_desc}"),
        ]
        .join("\n");

        let errs = parse(&input).unwrap_err();
        assert!(errs.iter().any(|e| e.message.contains("exceeds 500")));
    }

    #[test]
    fn parse_related_cases_too_many() {
        let mut lines = vec![
            "---".to_string(),
            "sources: []".to_string(),
            "---".to_string(),
            String::new(),
            "# Title".to_string(),
            String::new(),
            "## Related Cases".to_string(),
            String::new(),
        ];
        for i in 0..11 {
            lines.push(format!("- id/corruption/2002/case-{i}"));
            lines.push(format!("  description: Description {i}"));
        }
        let input = lines.join("\n");

        let errs = parse(&input).unwrap_err();
        assert!(errs.iter().any(|e| e.message.contains("exceeds 10")));
    }
}
weave_content/parser.rs

weave_content/
parser.rs