weave_content/
parser.rs

1#![allow(clippy::module_name_repetitions)]
2
3use std::fmt;
4
5use serde::{Deserialize, Serialize};
6
/// Maximum length of a case NULID (26 chars Crockford Base32).
///
/// `validate_front_matter` enforces this as an exact length: a front matter
/// `id` of any other length is rejected.
const MAX_CASE_ID_LEN: usize = 26;

/// Maximum number of sources in front matter.
const MAX_SOURCES: usize = 20;

/// Maximum length of the case title (H1).
///
/// The same limit is applied to the H1 name of standalone entity files.
const MAX_TITLE_LEN: usize = 200;

/// Maximum length of the case summary.
const MAX_SUMMARY_LEN: usize = 2000;

/// Maximum length of the tagline (tweet-sized for sharing).
const MAX_TAGLINE_LEN: usize = 280;
21
/// Known H2 section names for case files (case-insensitive match).
///
/// This list is shown to authors in the "unknown section" error, so it names
/// every heading that `SectionKind::is_case_section` accepts, including
/// `Involved`.
/// People and Organizations are no longer allowed in case files -- they
/// live in standalone entity files under `people/` and `organizations/`.
const KNOWN_CASE_SECTIONS: &[&str] = &[
    "Events",
    "Documents",
    "Assets",
    "Relationships",
    "Timeline",
    "Related Cases",
    "Involved",
];
33
/// A parsed case file with front matter, title, summary, and raw sections.
///
/// Returned by `parse` only when no parse errors were recorded.
#[derive(Debug)]
pub struct ParsedCase {
    /// NULID for the case node (None if not yet generated).
    pub id: Option<String>,
    /// Source entries from the front matter `sources` list.
    pub sources: Vec<SourceEntry>,
    /// Case title taken from the H1 heading.
    pub title: String,
    /// Summary text collected after the H1 title, with surrounding
    /// whitespace trimmed.
    pub summary: String,
    /// Raw H2 sections left for later processing. `## Related Cases` and
    /// `## Involved` are parsed into `related_cases` / `involved` and are
    /// removed from this list.
    pub sections: Vec<Section>,
    /// Case type from front matter (e.g. `corruption`, `fraud`).
    pub case_type: Option<String>,
    /// Case status from front matter (e.g. `open`, `trial`).
    pub status: Option<String>,
    /// Structured amounts DSL string (e.g. `660000 USD bribe | 250000000 IDR fine`).
    pub amounts: Option<String>,
    /// Tags from front matter for categorization.
    pub tags: Vec<String>,
    /// Optional tagline for sharing / display (max 280 chars).
    pub tagline: Option<String>,
    /// Related case entries from `## Related Cases` section.
    pub related_cases: Vec<RelatedCase>,
    /// Involved entity entries from `## Involved` section.
    pub involved: Vec<InvolvedEntry>,
}
58
/// A related case entry from `## Related Cases` section.
///
/// Built by `parse_related_cases` from a `- <case_path>` bullet and the
/// indented `description:` / `id:` fields that follow it.
#[derive(Debug, Clone, PartialEq, Eq, Serialize)]
pub struct RelatedCase {
    /// Case path relative to content root (e.g. `id/corruption/2002/blbi-liquidity-aid-scandal`).
    pub case_path: String,
    /// Description of the relationship between the cases.
    pub description: String,
    /// NULID for the `related_to` relationship (auto-generated on first build).
    /// Left out of the serialized form when absent.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub id: Option<String>,
    /// Line number (1-indexed) where this entry appears in the original file.
    /// This is the line of the `- <case_path>` bullet; never serialized.
    #[serde(skip)]
    pub line: usize,
}
73
/// An entity reference in the `## Involved` section.
///
/// Built by `parse_involved` from a `- Entity Name` bullet and an optional
/// `id:` line directly below it.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct InvolvedEntry {
    /// Entity name (must match a registry entity referenced in the case).
    pub entity_name: String,
    /// NULID for the `involved_in` relationship (auto-generated on first build).
    pub id: Option<String>,
    /// Line number (1-indexed) where this entry appears in the original file.
    /// This is the line of the `- Entity Name` bullet.
    pub line: usize,
}
84
/// A raw H2 section: the kind derived from its heading plus its unparsed
/// body content.
#[derive(Debug)]
pub struct Section {
    /// Section kind mapped from the H2 heading text.
    pub kind: SectionKind,
    /// Body lines collected for this section, joined with `\n`. The heading
    /// line itself is not included.
    pub body: String,
    /// Line number (1-indexed) where the H2 heading appears in the original file.
    pub line: usize,
}
93
/// The kind of an H2 section, derived from its heading text.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum SectionKind {
    People,
    Organizations,
    Events,
    Documents,
    Assets,
    Relationships,
    Timeline,
    RelatedCases,
    Involved,
}

impl SectionKind {
    /// Resolve heading text to a section kind.
    ///
    /// Surrounding whitespace is ignored and the comparison is ASCII
    /// case-insensitive. Returns `None` when the heading is not recognised.
    fn from_heading(heading: &str) -> Option<Self> {
        // Heading label -> kind, checked in order.
        const HEADINGS: [(&str, SectionKind); 9] = [
            ("People", SectionKind::People),
            ("Organizations", SectionKind::Organizations),
            ("Events", SectionKind::Events),
            ("Documents", SectionKind::Documents),
            ("Assets", SectionKind::Assets),
            ("Relationships", SectionKind::Relationships),
            ("Timeline", SectionKind::Timeline),
            ("Related Cases", SectionKind::RelatedCases),
            ("Involved", SectionKind::Involved),
        ];

        let wanted = heading.trim();
        HEADINGS
            .iter()
            .find(|(label, _)| wanted.eq_ignore_ascii_case(label))
            .map(|&(_, kind)| kind)
    }

    /// Whether this section kind is valid in case files.
    /// People and Organizations are no longer allowed in case files.
    pub fn is_case_section(self) -> bool {
        match self {
            Self::People | Self::Organizations => false,
            Self::Events
            | Self::Documents
            | Self::Assets
            | Self::Relationships
            | Self::Timeline
            | Self::RelatedCases
            | Self::Involved => true,
        }
    }
}
139
/// A parse failure tied to a line of the input file.
#[derive(Debug)]
pub struct ParseError {
    /// Line number the problem is reported against.
    pub line: usize,
    /// Human-readable description of the problem.
    pub message: String,
}

/// Renders the error as `line <n>: <message>`.
impl fmt::Display for ParseError {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let Self { line, message } = self;
        write!(f, "line {line}: {message}")
    }
}
152
/// Maximum number of tags per case file.
const MAX_CASE_TAGS: usize = 10;

/// Maximum number of tags per entity file.
const MAX_ENTITY_TAGS: usize = 5;

/// Maximum length of a single tag.
/// Applies to tags in both case files and entity files.
const MAX_TAG_LEN: usize = 50;

/// Maximum number of related case entries per case file.
const MAX_RELATED_CASES: usize = 10;

/// Maximum length of a related case description.
const MAX_RELATED_DESCRIPTION_LEN: usize = 500;
167
168/// Parse the body of a `## Related Cases` section into `RelatedCase` entries.
169///
170/// Each entry is a bullet `- <case_path>` followed by indented fields:
171/// `description: <text>` (required) and `id: <NULID>` (optional, written back).
172pub fn parse_related_cases(
173    body: &str,
174    section_start_line: usize,
175    errors: &mut Vec<ParseError>,
176) -> Vec<RelatedCase> {
177    let mut entries: Vec<(String, String, Option<String>, usize)> = Vec::new(); // (path, desc, id, line)
178
179    for (offset, line) in body.lines().enumerate() {
180        let file_line = section_start_line + offset + 1;
181
182        if let Some(rest) = line.strip_prefix("- ") {
183            let case_path = rest.trim().to_string();
184            entries.push((case_path, String::new(), None, file_line));
185        } else if let Some(rest) = line.strip_prefix("  description: ") {
186            if let Some(entry) = entries.last_mut() {
187                entry.1 = rest.trim().to_string();
188            } else {
189                errors.push(ParseError {
190                    line: file_line,
191                    message: "description without a preceding case path".into(),
192                });
193            }
194        } else if let Some(rest) = line.strip_prefix("  id: ") {
195            if let Some(entry) = entries.last_mut() {
196                entry.2 = Some(rest.trim().to_string());
197            } else {
198                errors.push(ParseError {
199                    line: file_line,
200                    message: "id without a preceding case path".into(),
201                });
202            }
203        } else if !line.trim().is_empty() {
204            errors.push(ParseError {
205                line: file_line,
206                message: format!("unexpected line in Related Cases: {line}"),
207            });
208        }
209    }
210
211    if entries.len() > MAX_RELATED_CASES {
212        errors.push(ParseError {
213            line: section_start_line,
214            message: format!(
215                "Related Cases exceeds {MAX_RELATED_CASES} entries (got {})",
216                entries.len()
217            ),
218        });
219    }
220
221    let mut result = Vec::new();
222    for (case_path, description, id, line) in entries {
223        if case_path.is_empty() {
224            errors.push(ParseError {
225                line,
226                message: "related case path must not be empty".into(),
227            });
228            continue;
229        }
230        if description.is_empty() {
231            errors.push(ParseError {
232                line,
233                message: format!("related case {case_path:?} missing description"),
234            });
235            continue;
236        }
237        if description.len() > MAX_RELATED_DESCRIPTION_LEN {
238            errors.push(ParseError {
239                line,
240                message: format!(
241                    "related case description exceeds {MAX_RELATED_DESCRIPTION_LEN} chars (got {})",
242                    description.len()
243                ),
244            });
245            continue;
246        }
247        result.push(RelatedCase {
248            case_path,
249            description,
250            id,
251            line,
252        });
253    }
254
255    result
256}
257
258/// Maximum number of entries in `## Involved` section.
259const MAX_INVOLVED: usize = 50;
260
261/// Parse the body of a `## Involved` section into `InvolvedEntry` items.
262///
263/// Format:
264/// ```text
265/// - Entity Name
266///   id: 01ABC...
267/// ```
268pub fn parse_involved(
269    body: &str,
270    section_start_line: usize,
271    errors: &mut Vec<ParseError>,
272) -> Vec<InvolvedEntry> {
273    let mut entries = Vec::new();
274    let lines: Vec<&str> = body.lines().collect();
275
276    let mut i = 0;
277    while i < lines.len() {
278        let file_line = section_start_line + 1 + i;
279        let trimmed = lines[i].trim();
280
281        if trimmed.is_empty() {
282            i += 1;
283            continue;
284        }
285
286        let Some(name) = trimmed.strip_prefix("- ") else {
287            errors.push(ParseError {
288                line: file_line,
289                message: format!("expected involved entry `- Entity Name`, got {trimmed:?}"),
290            });
291            i += 1;
292            continue;
293        };
294
295        let entity_name = name.trim().to_string();
296        if entity_name.is_empty() {
297            errors.push(ParseError {
298                line: file_line,
299                message: "involved entity name must not be empty".into(),
300            });
301            i += 1;
302            continue;
303        }
304
305        // Look ahead for `id:` on the next line
306        let mut id: Option<String> = None;
307        if i + 1 < lines.len() {
308            let next = lines[i + 1].trim();
309            if let Some(id_val) = next.strip_prefix("id: ") {
310                id = Some(id_val.trim().to_string());
311                i += 1;
312            }
313        }
314
315        entries.push(InvolvedEntry {
316            entity_name,
317            id,
318            line: file_line,
319        });
320
321        i += 1;
322    }
323
324    if entries.len() > MAX_INVOLVED {
325        errors.push(ParseError {
326            line: section_start_line,
327            message: format!(
328                "Involved exceeds {MAX_INVOLVED} entries (got {})",
329                entries.len()
330            ),
331        });
332    }
333
334    entries
335}
336
/// YAML front matter schema.
///
/// Every field may be omitted in the YAML (`#[serde(default)]`); the values
/// are checked afterwards by `validate_front_matter`.
#[derive(Deserialize)]
struct FrontMatter {
    /// NULID for the case node (auto-generated on first build).
    #[serde(default)]
    id: Option<String>,
    /// Source references, either plain URL strings or structured entries.
    #[serde(default)]
    sources: Vec<SourceEntry>,
    /// Case type; must be a known `CaseType` value or a custom value.
    #[serde(default)]
    case_type: Option<String>,
    /// Case status; must be a known `CaseStatus` value.
    #[serde(default)]
    status: Option<String>,
    /// Amounts string, passed through to `ParsedCase` unchanged.
    #[serde(default)]
    amounts: Option<String>,
    /// Tags for categorization.
    #[serde(default)]
    tags: Vec<String>,
    /// Optional tagline for sharing / display.
    #[serde(default)]
    tagline: Option<String>,
}
/// A source entry in front matter: either a plain URL string or a
/// structured object with metadata.
///
/// The enum is `#[serde(untagged)]`, so the variant is chosen from the shape
/// of the value rather than from a tag field.
#[derive(Debug, Clone, PartialEq, Eq, Deserialize, Serialize)]
#[serde(untagged)]
pub enum SourceEntry {
    /// Plain URL string (backward-compatible).
    Url(String),
    /// Structured source with metadata.
    Structured {
        /// Source URL (the only required field).
        url: String,
        /// Optional title of the source.
        #[serde(default)]
        title: Option<String>,
        /// Optional publication date, kept as the raw string from the YAML.
        #[serde(default)]
        published_at: Option<String>,
        /// Optional language of the source, kept as the raw string from the YAML.
        #[serde(default)]
        language: Option<String>,
    },
}
373
374impl SourceEntry {
375    /// Get the URL from any source entry variant.
376    pub fn url(&self) -> &str {
377        match self {
378            Self::Url(u) => u,
379            Self::Structured { url, .. } => url,
380        }
381    }
382}
383
/// YAML front matter schema for standalone entity files.
/// Contains an optional `id` field (NULID, generated on first build) and an
/// optional list of `tags`.
#[derive(Deserialize)]
struct EntityFrontMatter {
    /// Stored NULID, absent until it has been generated.
    #[serde(default)]
    id: Option<String>,
    /// Tags for the entity; empty when the field is omitted.
    #[serde(default)]
    tags: Vec<String>,
}
393
/// A parsed standalone entity file (actor or institution).
///
/// Returned by `parse_entity_file` only when no parse errors were recorded.
#[derive(Debug)]
pub struct ParsedEntityFile {
    /// Stored NULID from front matter (None if not yet generated).
    pub id: Option<String>,
    /// Entity name from H1 heading.
    pub name: String,
    /// Raw bullet field lines (body after H1, no sections), joined with `\n`
    /// and not parsed further at this stage.
    pub body: String,
    /// Line number of the H1 heading in the original file.
    pub title_line: usize,
    /// Tags from front matter.
    pub tags: Vec<String>,
}
408
409/// Parse a Markdown case file into a `ParsedCase`.
410///
411/// Extracts YAML front matter, H1 title, summary, and H2 sections.
412/// Returns errors for malformed structure or boundary violations.
413pub fn parse(input: &str) -> Result<ParsedCase, Vec<ParseError>> {
414    let mut errors = Vec::new();
415
416    // Extract front matter
417    let (front_matter, body_start_line, body) = extract_front_matter(input, &mut errors);
418
419    let Some(front_matter) = front_matter else {
420        if errors.is_empty() {
421            errors.push(ParseError {
422                line: 1,
423                message: "missing YAML front matter (expected `---` delimiter)".into(),
424            });
425        }
426        return Err(errors);
427    };
428
429    // Validate front matter fields
430    validate_front_matter(&front_matter, &mut errors);
431
432    // Extract title, summary, and sections from body
433    let (title, summary, mut sections) = extract_body(&body, body_start_line, &mut errors);
434
435    // Parse Related Cases sections
436    let mut related_cases = Vec::new();
437    for section in &sections {
438        if section.kind == SectionKind::RelatedCases {
439            let entries = parse_related_cases(&section.body, section.line, &mut errors);
440            related_cases.extend(entries);
441        }
442    }
443    // Remove RelatedCases from sections list (consumed)
444    sections.retain(|s| s.kind != SectionKind::RelatedCases);
445
446    // Parse Involved sections
447    let mut involved = Vec::new();
448    for section in &sections {
449        if section.kind == SectionKind::Involved {
450            let entries = parse_involved(&section.body, section.line, &mut errors);
451            involved.extend(entries);
452        }
453    }
454    // Remove Involved from sections list (consumed)
455    sections.retain(|s| s.kind != SectionKind::Involved);
456
457    if !errors.is_empty() {
458        return Err(errors);
459    }
460
461    Ok(ParsedCase {
462        id: front_matter.id,
463        sources: front_matter.sources,
464        title,
465        summary,
466        sections,
467        case_type: front_matter.case_type,
468        status: front_matter.status,
469        amounts: front_matter.amounts,
470        tags: front_matter.tags,
471        tagline: front_matter.tagline,
472        related_cases,
473        involved,
474    })
475}
476
477/// Parse a standalone entity file (actor or institution).
478///
479/// Entity files have YAML front matter with optional `id:`, an H1 name,
480/// and bullet fields directly in the body. No H2 sections are allowed.
481pub fn parse_entity_file(input: &str) -> Result<ParsedEntityFile, Vec<ParseError>> {
482    let mut errors = Vec::new();
483
484    let (front_matter, body_start_line, body) = extract_entity_front_matter(input, &mut errors);
485
486    let id = front_matter.as_ref().and_then(|fm| fm.id.clone());
487    let tags = front_matter.map_or_else(Vec::new, |fm| fm.tags);
488
489    // Validate entity tags
490    if tags.len() > MAX_ENTITY_TAGS {
491        errors.push(ParseError {
492            line: 2,
493            message: format!(
494                "front matter `tags` exceeds {MAX_ENTITY_TAGS} entries (got {})",
495                tags.len()
496            ),
497        });
498    }
499    for (i, tag) in tags.iter().enumerate() {
500        if tag.len() > MAX_TAG_LEN {
501            errors.push(ParseError {
502                line: 2,
503                message: format!("front matter tag #{} exceeds {MAX_TAG_LEN} chars", i + 1),
504            });
505        }
506        if tag.is_empty() {
507            errors.push(ParseError {
508                line: 2,
509                message: format!("front matter tag #{} is empty", i + 1),
510            });
511        }
512    }
513
514    // Extract H1 title and body content (no sections allowed)
515    let (name, title_line, field_body) = extract_entity_body(&body, body_start_line, &mut errors);
516
517    if !errors.is_empty() {
518        return Err(errors);
519    }
520
521    Ok(ParsedEntityFile {
522        id,
523        name,
524        body: field_body,
525        title_line,
526        tags,
527    })
528}
529
530/// Extract YAML front matter for entity files.
531/// Front matter is optional for entity files -- if absent, returns None with no error.
532fn extract_entity_front_matter(
533    input: &str,
534    errors: &mut Vec<ParseError>,
535) -> (Option<EntityFrontMatter>, usize, String) {
536    let lines: Vec<&str> = input.lines().collect();
537
538    let first_delim = lines.iter().position(|l| l.trim() == "---");
539    if first_delim != Some(0) {
540        // No front matter -- entire file is body, starting at line 1
541        return (None, 1, input.to_string());
542    }
543
544    let close_delim = lines[1..].iter().position(|l| l.trim() == "---");
545    let Some(close_offset) = close_delim else {
546        errors.push(ParseError {
547            line: 1,
548            message: "unclosed YAML front matter (missing closing `---`)".into(),
549        });
550        return (None, 1, String::new());
551    };
552
553    let close_line = close_offset + 1;
554    let yaml_str: String = lines[1..close_line].join("\n");
555    let body_start_line = close_line + 2; // 1-indexed line number after closing `---`
556    let body = lines[close_line + 1..].join("\n");
557
558    match serde_yaml::from_str::<EntityFrontMatter>(&yaml_str) {
559        Ok(fm) => (Some(fm), body_start_line, body),
560        Err(e) => {
561            errors.push(ParseError {
562                line: 2,
563                message: format!("invalid YAML front matter: {e}"),
564            });
565            (None, body_start_line, body)
566        }
567    }
568}
569
570/// Extract H1 name and field body from an entity file.
571/// Rejects any H2 sections.
572fn extract_entity_body(
573    body: &str,
574    body_start_line: usize,
575    errors: &mut Vec<ParseError>,
576) -> (String, usize, String) {
577    let lines: Vec<&str> = body.lines().collect();
578    let mut name = String::new();
579    let mut title_found = false;
580    let mut title_line = body_start_line;
581    let mut field_lines: Vec<&str> = Vec::new();
582
583    for (i, line) in lines.iter().enumerate() {
584        let file_line = body_start_line + i;
585
586        if let Some(heading) = strip_heading(line, 1) {
587            if title_found {
588                errors.push(ParseError {
589                    line: file_line,
590                    message: "multiple H1 headings found (expected exactly one)".into(),
591                });
592                continue;
593            }
594            name = heading.to_string();
595            title_found = true;
596            title_line = file_line;
597            continue;
598        }
599
600        // Reject H2 sections in entity files
601        if strip_heading(line, 2).is_some() {
602            errors.push(ParseError {
603                line: file_line,
604                message: "H2 sections are not allowed in entity files".into(),
605            });
606            continue;
607        }
608
609        if title_found {
610            field_lines.push(line);
611        } else if !line.trim().is_empty() {
612            errors.push(ParseError {
613                line: file_line,
614                message: "expected H1 heading (# Name)".into(),
615            });
616        }
617    }
618
619    if !title_found {
620        errors.push(ParseError {
621            line: body_start_line,
622            message: "missing H1 heading".into(),
623        });
624    } else if name.len() > MAX_TITLE_LEN {
625        errors.push(ParseError {
626            line: title_line,
627            message: format!("H1 name exceeds {MAX_TITLE_LEN} chars (got {})", name.len()),
628        });
629    }
630
631    (name, title_line, field_lines.join("\n"))
632}
633
634/// Extract YAML front matter delimited by `---` lines.
635/// Returns the parsed front matter, the line number where the body starts,
636/// and the body text.
637fn extract_front_matter(
638    input: &str,
639    errors: &mut Vec<ParseError>,
640) -> (Option<FrontMatter>, usize, String) {
641    let lines: Vec<&str> = input.lines().collect();
642
643    // First non-empty line must be `---`
644    let first_delim = lines.iter().position(|l| l.trim() == "---");
645    if first_delim != Some(0) {
646        errors.push(ParseError {
647            line: 1,
648            message: "missing YAML front matter (expected `---` on first line)".into(),
649        });
650        return (None, 1, input.to_string());
651    }
652
653    // Find closing `---`
654    let close_delim = lines[1..].iter().position(|l| l.trim() == "---");
655    let Some(close_offset) = close_delim else {
656        errors.push(ParseError {
657            line: 1,
658            message: "unclosed YAML front matter (missing closing `---`)".into(),
659        });
660        return (None, 1, String::new());
661    };
662
663    let close_line = close_offset + 1; // index in `lines`
664    let yaml_str: String = lines[1..close_line].join("\n");
665    let body_start_line = close_line + 2; // 1-indexed line number after closing `---`
666    let body = lines[close_line + 1..].join("\n");
667
668    match serde_yaml::from_str::<FrontMatter>(&yaml_str) {
669        Ok(fm) => (Some(fm), body_start_line, body),
670        Err(e) => {
671            errors.push(ParseError {
672                line: 2,
673                message: format!("invalid YAML front matter: {e}"),
674            });
675            (None, body_start_line, body)
676        }
677    }
678}
679
680fn validate_front_matter(fm: &FrontMatter, errors: &mut Vec<ParseError>) {
681    // Validate case ID (NULID) if present
682    if let Some(id) = &fm.id
683        && id.len() != MAX_CASE_ID_LEN
684    {
685        errors.push(ParseError {
686            line: 2,
687            message: format!(
688                "front matter `id` must be a {MAX_CASE_ID_LEN}-char NULID, got {} chars",
689                id.len()
690            ),
691        });
692    }
693
694    // Validate sources count
695    if fm.sources.len() > MAX_SOURCES {
696        errors.push(ParseError {
697            line: 2,
698            message: format!(
699                "front matter `sources` exceeds {MAX_SOURCES} entries (got {})",
700                fm.sources.len()
701            ),
702        });
703    }
704
705    // Validate each source URL is HTTPS
706    for (i, source) in fm.sources.iter().enumerate() {
707        if !source.url().starts_with("https://") {
708            errors.push(ParseError {
709                line: 2,
710                message: format!("source[{i}] must be HTTPS, got {:?}", source.url()),
711            });
712        }
713    }
714
715    // Validate case_type
716    if let Some(ct) = &fm.case_type {
717        use crate::domain::CaseType;
718        let normalized = ct.to_lowercase().replace(' ', "_");
719        if !CaseType::KNOWN.contains(&normalized.as_str())
720            && crate::domain::parse_custom(ct).is_none()
721        {
722            errors.push(ParseError {
723                line: 2,
724                message: format!(
725                    "invalid case_type {:?} (known: {}; use \"custom:Value\" for custom)",
726                    ct,
727                    CaseType::KNOWN.join(", ")
728                ),
729            });
730        }
731    }
732
733    // Validate status
734    if let Some(st) = &fm.status {
735        use crate::domain::CaseStatus;
736        let normalized = st.to_lowercase().replace(' ', "_");
737        if !CaseStatus::KNOWN.contains(&normalized.as_str()) {
738            errors.push(ParseError {
739                line: 2,
740                message: format!(
741                    "invalid status {:?} (known: {})",
742                    st,
743                    CaseStatus::KNOWN.join(", ")
744                ),
745            });
746        }
747    }
748
749    // Validate tags
750    if fm.tags.len() > MAX_CASE_TAGS {
751        errors.push(ParseError {
752            line: 2,
753            message: format!(
754                "front matter `tags` exceeds {MAX_CASE_TAGS} entries (got {})",
755                fm.tags.len()
756            ),
757        });
758    }
759    for (i, tag) in fm.tags.iter().enumerate() {
760        if tag.len() > MAX_TAG_LEN {
761            errors.push(ParseError {
762                line: 2,
763                message: format!("tag[{i}] exceeds {MAX_TAG_LEN} chars (got {})", tag.len()),
764            });
765        }
766        if tag.is_empty() {
767            errors.push(ParseError {
768                line: 2,
769                message: format!("tag[{i}] must not be empty"),
770            });
771        }
772    }
773
774    // Validate tagline
775    if let Some(tl) = &fm.tagline {
776        if tl.len() > MAX_TAGLINE_LEN {
777            errors.push(ParseError {
778                line: 2,
779                message: format!(
780                    "tagline exceeds {MAX_TAGLINE_LEN} chars (got {})",
781                    tl.len()
782                ),
783            });
784        }
785        if tl.trim().is_empty() {
786            errors.push(ParseError {
787                line: 2,
788                message: "tagline must not be empty".to_string(),
789            });
790        }
791    }
792}
793
794/// Extract the H1 title, summary text, and H2 sections from the body.
795#[allow(clippy::too_many_lines)]
796fn extract_body(
797    body: &str,
798    body_start_line: usize,
799    errors: &mut Vec<ParseError>,
800) -> (String, String, Vec<Section>) {
801    let lines: Vec<&str> = body.lines().collect();
802    let mut title = String::new();
803    let mut title_found = false;
804    let mut summary_lines: Vec<&str> = Vec::new();
805    let mut sections: Vec<Section> = Vec::new();
806
807    // Track current H2 section being built
808    let mut current_section_kind: Option<SectionKind> = None;
809    let mut current_section_line: usize = 0;
810    let mut current_section_body: Vec<&str> = Vec::new();
811
812    // State: before H1, after H1 (summary), in sections
813    let mut state = State::BeforeTitle;
814
815    for (i, line) in lines.iter().enumerate() {
816        let file_line = body_start_line + i; // 1-indexed line in original file
817
818        if let Some(heading) = strip_heading(line, 1) {
819            if title_found {
820                errors.push(ParseError {
821                    line: file_line,
822                    message: "multiple H1 headings found (expected exactly one)".into(),
823                });
824                continue;
825            }
826            title = heading.to_string();
827            title_found = true;
828            state = State::Summary;
829            continue;
830        }
831
832        if let Some(heading) = strip_heading(line, 2) {
833            // Flush previous section
834            if let Some(kind) = current_section_kind.take() {
835                sections.push(Section {
836                    kind,
837                    body: current_section_body.join("\n"),
838                    line: current_section_line,
839                });
840                current_section_body.clear();
841            }
842
843            match SectionKind::from_heading(heading) {
844                Some(kind) if kind.is_case_section() => {
845                    // Check for duplicate sections
846                    if sections.iter().any(|s| s.kind == kind) {
847                        errors.push(ParseError {
848                            line: file_line,
849                            message: format!("duplicate section: ## {heading}"),
850                        });
851                    }
852                    current_section_kind = Some(kind);
853                    current_section_line = file_line;
854                    state = State::InSection;
855                }
856                Some(_) => {
857                    // Legacy section (People/Organizations) -- not allowed in case files
858                    errors.push(ParseError {
859                        line: file_line,
860                        message: format!(
861                            "## {heading} is not allowed in case files (use standalone entity files in people/ or organizations/ instead)"
862                        ),
863                    });
864                }
865                None => {
866                    errors.push(ParseError {
867                        line: file_line,
868                        message: format!(
869                            "unknown section: ## {heading} (expected one of: {})",
870                            KNOWN_CASE_SECTIONS.join(", ")
871                        ),
872                    });
873                }
874            }
875            continue;
876        }
877
878        match state {
879            State::BeforeTitle => {
880                // Skip blank lines before title
881                if !line.trim().is_empty() {
882                    errors.push(ParseError {
883                        line: file_line,
884                        message: "expected H1 title (# Title)".into(),
885                    });
886                }
887            }
888            State::Summary => {
889                summary_lines.push(line);
890            }
891            State::InSection => {
892                current_section_body.push(line);
893            }
894        }
895    }
896
897    // Flush last section
898    if let Some(kind) = current_section_kind.take() {
899        sections.push(Section {
900            kind,
901            body: current_section_body.join("\n"),
902            line: current_section_line,
903        });
904    }
905
906    // Validate title
907    if !title_found {
908        errors.push(ParseError {
909            line: body_start_line,
910            message: "missing H1 title".into(),
911        });
912    } else if title.len() > MAX_TITLE_LEN {
913        errors.push(ParseError {
914            line: body_start_line,
915            message: format!(
916                "H1 title exceeds {MAX_TITLE_LEN} chars (got {})",
917                title.len()
918            ),
919        });
920    }
921
922    // Build summary (trim leading/trailing blank lines)
923    let summary = summary_lines.clone().join("\n").trim().to_string();
924
925    if summary.len() > MAX_SUMMARY_LEN {
926        errors.push(ParseError {
927            line: body_start_line,
928            message: format!(
929                "summary exceeds {MAX_SUMMARY_LEN} chars (got {})",
930                summary.len()
931            ),
932        });
933    }
934
935    (title, summary, sections)
936}
937
/// Position of the line-by-line body parser within the document.
///
/// The parser dispatches every non-heading line on this value.
#[derive(Clone, Copy)]
enum State {
    /// No H1 title has been consumed yet; any non-blank line seen in this
    /// state is reported as "expected H1 title (# Title)".
    BeforeTitle,
    /// Lines are being collected as summary text.
    Summary,
    /// Lines are being collected as the body of the current section.
    InSection,
}
944
/// Strip an ATX heading prefix of exactly `level` `#` characters.
///
/// Leading whitespace on `line` is ignored. The line is a heading of the
/// requested level when it starts with exactly `level` `#` characters that
/// are followed by either the end of the line or a single space.
///
/// Returns the heading text with surrounding whitespace trimmed (`Some("")`
/// for a bare marker such as `##`), or `None` when the line is not a heading
/// of that level. A deeper heading never matches a shallower level, so
/// `strip_heading("### Foo", 2)` is `None`.
///
/// The heading text itself may begin with `#`: `strip_heading("## #Foo", 2)`
/// returns `Some("#Foo")`.
///
/// E.g., `strip_heading("## Foo", 2)` returns `Some("Foo")`.
fn strip_heading(line: &str, level: usize) -> Option<&str> {
    let trimmed = line.trim_start();

    // Measure the whole run of leading `#` so that `###` cannot be mistaken
    // for `##` followed by text.
    let hashes = trimmed.bytes().take_while(|&b| b == b'#').count();
    if hashes != level {
        return None;
    }

    // `#` is a single ASCII byte, so `level` is always a char boundary here.
    let rest = &trimmed[level..];
    if rest.is_empty() {
        return Some("");
    }

    // The marker must be separated from the text by a space.
    rest.strip_prefix(' ').map(str::trim)
}
967
#[cfg(test)]
mod tests {
    use super::*;

    /// Join `parts` with `\n`; no trailing newline is appended.
    fn doc(parts: &[&str]) -> String {
        parts.join("\n")
    }

    /// Build a case file made of an empty `sources` list, the H1 `# Title`,
    /// one blank line, and then `rest` joined with `\n`.
    fn titled_case(rest: &[&str]) -> String {
        let mut text = String::from("---\nsources: []\n---\n\n# Title\n\n");
        text.push_str(&rest.join("\n"));
        text
    }

    /// Parse `input` as a case file, panicking with all errors joined by
    /// `; ` when parsing fails.
    #[track_caller]
    fn parse_ok(input: &str) -> ParsedCase {
        match parse(input) {
            Ok(case) => case,
            Err(errs) => panic!(
                "parse failed: {}",
                errs.iter()
                    .map(ToString::to_string)
                    .collect::<Vec<_>>()
                    .join("; ")
            ),
        }
    }

    /// Assert that parsing `input` as a case file fails and that at least
    /// one error message contains `needle`.
    #[track_caller]
    fn assert_case_error(input: &str, needle: &str) {
        let errs = parse(input).unwrap_err();
        assert!(errs.iter().any(|e| e.message.contains(needle)));
    }

    /// Assert that parsing `input` as an entity file fails and that at least
    /// one error message contains `needle`.
    #[track_caller]
    fn assert_entity_error(input: &str, needle: &str) {
        let errs = parse_entity_file(input).unwrap_err();
        assert!(errs.iter().any(|e| e.message.contains(needle)));
    }

    /// Smallest case file used by the happy-path test: front matter with an
    /// id and one source, a title, a summary, and two sections.
    fn minimal_case() -> String {
        doc(&[
            "---",
            "id: 01H9XT7H1J3929RK32FWSRKV88",
            "sources:",
            "  - https://example.com/source",
            "---",
            "",
            "# Test Case Title",
            "",
            "This is the summary.",
            "",
            "## Events",
            "",
            "### Something happened",
            "- occurred_at: 2025-01-01",
            "",
            "## Relationships",
            "",
            "- Something happened -> Something happened: associate_of",
        ])
    }

    #[test]
    fn parse_minimal_case() {
        let case = parse_ok(&minimal_case());

        assert_eq!(case.id.as_deref(), Some("01H9XT7H1J3929RK32FWSRKV88"));
        assert_eq!(case.sources.len(), 1);
        assert_eq!(case.sources[0].url(), "https://example.com/source");
        assert_eq!(case.title, "Test Case Title");
        assert_eq!(case.summary, "This is the summary.");
        assert_eq!(case.sections.len(), 2);
        assert_eq!(case.sections[0].kind, SectionKind::Events);
        assert_eq!(case.sections[1].kind, SectionKind::Relationships);
    }

    #[test]
    fn parse_missing_front_matter() {
        assert_case_error("# Title\n\nSummary.\n", "front matter");
    }

    #[test]
    fn parse_unclosed_front_matter() {
        assert_case_error("---\nsources: []\n# Title\n", "unclosed");
    }

    #[test]
    fn parse_invalid_case_id_wrong_length() {
        assert_case_error("---\nid: short\nsources: []\n---\n\n# Title\n", "NULID");
    }

    #[test]
    fn parse_case_id_absent_is_ok() {
        let parsed =
            parse("---\nsources:\n  - https://example.com\n---\n\n# Title\n\nSummary.\n").unwrap();
        assert!(parsed.id.is_none());
    }

    #[test]
    fn parse_non_https_source() {
        assert_case_error(
            "---\nsources:\n  - http://example.com\n---\n\n# Title\n",
            "HTTPS",
        );
    }

    #[test]
    fn parse_too_many_sources() {
        // 21 entries: one more than the front-matter limit.
        let source_lines: String = (0..21)
            .map(|i| format!("  - https://example.com/{i}\n"))
            .collect();
        let input = format!("---\nsources:\n{source_lines}---\n\n# Title\n");
        assert_case_error(&input, "exceeds 20");
    }

    #[test]
    fn parse_unknown_section() {
        let input = titled_case(&["## Unknown Section", ""]);
        assert_case_error(&input, "unknown section");
    }

    #[test]
    fn parse_duplicate_section() {
        let input = titled_case(&["## Events", "", "## Events", ""]);
        assert_case_error(&input, "duplicate");
    }

    #[test]
    fn parse_multiple_h1() {
        let input = doc(&[
            "---",
            "sources: []",
            "---",
            "",
            "# First Title",
            "",
            "# Second Title",
            "",
        ]);
        assert_case_error(&input, "multiple H1");
    }

    #[test]
    fn parse_all_sections() {
        let input = doc(&[
            "---",
            "id: 01H9XT7H1KRQ9SJ7SD9ETB5CVQ",
            "sources:",
            "  - https://example.com/a",
            "---",
            "",
            "# Full Case",
            "",
            "Summary text here.",
            "",
            "## Events",
            "",
            "### Something happened",
            "- occurred_at: 2025-01-01",
            "",
            "## Relationships",
            "",
            "- Alice -> Corp Inc: employed_by",
            "",
            "## Timeline",
            "",
            "Something happened",
        ]);

        let case = parse_ok(&input);

        assert_eq!(case.id.as_deref(), Some("01H9XT7H1KRQ9SJ7SD9ETB5CVQ"));
        assert_eq!(case.title, "Full Case");
        assert_eq!(case.summary, "Summary text here.");
        assert_eq!(case.sections.len(), 3);
        assert_eq!(case.sections[0].kind, SectionKind::Events);
        assert_eq!(case.sections[1].kind, SectionKind::Relationships);
        assert_eq!(case.sections[2].kind, SectionKind::Timeline);
    }

    #[test]
    fn parse_empty_summary() {
        let case = parse_ok(&titled_case(&["## Events", ""]));
        assert_eq!(case.summary, "");
    }

    #[test]
    fn parse_multiline_summary() {
        let input = titled_case(&[
            "First line of summary.",
            "Second line of summary.",
            "",
            "## Events",
            "",
        ]);

        let case = parse_ok(&input);
        assert_eq!(
            case.summary,
            "First line of summary.\nSecond line of summary."
        );
    }

    #[test]
    fn strip_heading_levels() {
        // (line, requested level, expected heading text)
        let table: &[(&str, usize, Option<&str>)] = &[
            ("# Title", 1, Some("Title")),
            ("## Section", 2, Some("Section")),
            ("### Entity", 3, Some("Entity")),
            // H3 should not match H2
            ("### Entity", 2, None),
            // H2 should not match H1
            ("## Section", 1, None),
            // Not a heading
            ("Normal text", 1, None),
        ];
        for &(line, level, expected) in table {
            assert_eq!(strip_heading(line, level), expected);
        }
    }

    #[test]
    fn section_body_content() {
        let input = titled_case(&[
            "## Events",
            "",
            "### Bonnick dismissal",
            "- occurred_at: 2024-12-24",
            "- type: termination",
            "",
        ]);

        let case = parse_ok(&input);

        assert_eq!(case.sections.len(), 1);
        let events_body = &case.sections[0].body;
        assert!(events_body.contains("### Bonnick dismissal"));
        assert!(events_body.contains("- occurred_at: 2024-12-24"));
    }

    #[test]
    fn parse_rejects_people_section_in_case_file() {
        let input = titled_case(&["## People", ""]);
        assert_case_error(&input, "not allowed in case files");
    }

    #[test]
    fn parse_rejects_organizations_section_in_case_file() {
        let input = titled_case(&["## Organizations", ""]);
        assert_case_error(&input, "not allowed in case files");
    }

    #[test]
    fn parse_entity_file_with_id() {
        let input = doc(&[
            "---",
            "id: 01JXYZ123456789ABCDEFGHIJK",
            "---",
            "",
            "# Mark Bonnick",
            "",
            "- qualifier: Arsenal Kit Manager",
            "- nationality: British",
            "",
        ]);

        let entity = parse_entity_file(&input).unwrap();
        assert_eq!(entity.id.as_deref(), Some("01JXYZ123456789ABCDEFGHIJK"));
        assert_eq!(entity.name, "Mark Bonnick");
        assert!(entity.body.contains("- qualifier: Arsenal Kit Manager"));
        assert!(entity.body.contains("- nationality: British"));
    }

    #[test]
    fn parse_entity_file_without_id() {
        let input = doc(&[
            "---",
            "---",
            "",
            "# Arsenal FC",
            "",
            "- qualifier: English Football Club",
            "- org_type: sports_club",
            "",
        ]);

        let entity = parse_entity_file(&input).unwrap();
        assert!(entity.id.is_none());
        assert_eq!(entity.name, "Arsenal FC");
    }

    #[test]
    fn parse_entity_file_no_front_matter() {
        let input = doc(&["# Bob Smith", "", "- nationality: Dutch", ""]);

        let entity = parse_entity_file(&input).unwrap();
        assert!(entity.id.is_none());
        assert_eq!(entity.name, "Bob Smith");
        assert!(entity.body.contains("- nationality: Dutch"));
    }

    #[test]
    fn parse_entity_file_rejects_h2_sections() {
        let input = doc(&[
            "---",
            "---",
            "",
            "# Test Entity",
            "",
            "## Relationships",
            "",
        ]);
        assert_entity_error(&input, "H2 sections");
    }

    #[test]
    fn parse_entity_file_missing_h1() {
        let input = doc(&["---", "---", "", "- nationality: Dutch", ""]);
        assert_entity_error(&input, "missing H1");
    }

    #[test]
    fn parse_related_cases_section() {
        let input = doc(&[
            "---",
            "tags: [bribery]",
            "sources:",
            "  - https://example.com",
            "---",
            "",
            "# Test Case",
            "",
            "Summary text.",
            "",
            "## Related Cases",
            "",
            "- id/corruption/2002/blbi-liquidity-aid-scandal",
            "  description: Artalyta bribed Urip to influence the BLBI investigation",
            "- id/corruption/2008/another-case",
            "  description: A second related case",
        ]);

        let case = parse_ok(&input);

        assert_eq!(case.related_cases.len(), 2);

        let first = &case.related_cases[0];
        assert_eq!(
            first.case_path,
            "id/corruption/2002/blbi-liquidity-aid-scandal"
        );
        assert_eq!(
            first.description,
            "Artalyta bribed Urip to influence the BLBI investigation"
        );

        let second = &case.related_cases[1];
        assert_eq!(second.case_path, "id/corruption/2008/another-case");
        assert_eq!(second.description, "A second related case");

        // RelatedCases should be consumed and NOT appear in sections
        assert!(
            case.sections
                .iter()
                .all(|s| s.kind != SectionKind::RelatedCases)
        );
    }

    #[test]
    fn parse_related_cases_empty_path() {
        let input = titled_case(&[
            "## Related Cases",
            "",
            "- ",
            "  description: Some description",
        ]);
        assert_case_error(&input, "case path must not be empty");
    }

    #[test]
    fn parse_related_cases_missing_description() {
        let input = titled_case(&["## Related Cases", "", "- id/corruption/2002/some-case"]);
        assert_case_error(&input, "description");
    }

    #[test]
    fn parse_related_cases_description_too_long() {
        // 501 characters: one more than the limit named in the error.
        let description_line = format!("  description: {}", "x".repeat(501));
        let input = titled_case(&[
            "## Related Cases",
            "",
            "- id/corruption/2002/some-case",
            &description_line,
        ]);
        assert_case_error(&input, "exceeds 500");
    }

    #[test]
    fn parse_related_cases_too_many() {
        // 11 entries: one more than the limit named in the error.
        let mut entries: Vec<String> = Vec::with_capacity(22);
        for i in 0..11 {
            entries.push(format!("- id/corruption/2002/case-{i}"));
            entries.push(format!("  description: Description {i}"));
        }

        let mut body: Vec<&str> = vec!["## Related Cases", ""];
        body.extend(entries.iter().map(String::as_str));

        assert_case_error(&titled_case(&body), "exceeds 10");
    }
}
weave_content/parser.rs

weave_content/
parser.rs