Skip to main content

weave_content/
entity.rs

1use std::fmt;
2
3use crate::parser::{ParseError, SectionKind};
4
/// Upper bound on the number of entities `parse_entities` accepts from one
/// section before it reports a "too many entities" error.
const MAX_ENTITIES_PER_FILE: usize = 50;

/// Upper bound on an entity name, compared against `str::len`.
const MAX_NAME_LEN: usize = 300;
10
/// Kind of an entity, determined by the section the entity is declared in.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum Label {
    /// Entity declared in a `People` section.
    Person,
    /// Entity declared in an `Organizations` section.
    Organization,
    /// Entity declared in an `Events` section.
    Event,
    /// Entity declared in a `Documents` section.
    Document,
    /// Entity declared in an `Assets` section.
    Asset,
}
20
21impl fmt::Display for Label {
22    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
23        match self {
24            Self::Person => write!(f, "person"),
25            Self::Organization => write!(f, "organization"),
26            Self::Event => write!(f, "event"),
27            Self::Document => write!(f, "document"),
28            Self::Asset => write!(f, "asset"),
29        }
30    }
31}
32
33impl Label {
34    pub fn from_section(kind: SectionKind) -> Option<Self> {
35        match kind {
36            SectionKind::People => Some(Self::Person),
37            SectionKind::Organizations => Some(Self::Organization),
38            SectionKind::Events => Some(Self::Event),
39            SectionKind::Documents => Some(Self::Document),
40            SectionKind::Assets => Some(Self::Asset),
41            _ => None,
42        }
43    }
44}
45
46/// A parsed entity with its name, label, and field map.
47#[derive(Debug, Clone)]
48pub struct Entity {
49    pub name: String,
50    pub label: Label,
51    pub fields: Vec<(String, FieldValue)>,
52    /// Stored NULID from `id:` field (None if not yet generated).
53    pub id: Option<String>,
54    /// Line number (1-indexed) of the H3 heading.
55    pub line: usize,
56    /// Tags from front matter (empty for inline entities).
57    pub tags: Vec<String>,
58    /// File-path slug (e.g. `people/id/harvey-moeis`). Only set for
59    /// registry entities that have standalone files.
60    pub slug: Option<String>,
61}
62
/// Value of an entity field.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum FieldValue {
    /// A single string value.
    Single(String),
    /// A list of string values.
    List(Vec<String>),
}
69
70/// Parse a single entity from a standalone entity file body.
71/// The body is the text after the H1 heading (bullet fields, no H3 headings).
72/// `label` is determined by the file's directory (people/ or organizations/).
73/// `id` comes from the front matter (may be None).
74pub fn parse_entity_file_body(
75    name: &str,
76    body: &str,
77    label: Label,
78    id: Option<String>,
79    title_line: usize,
80    errors: &mut Vec<ParseError>,
81) -> Entity {
82    let section_kind = match label {
83        Label::Person => SectionKind::People,
84        Label::Organization => SectionKind::Organizations,
85        Label::Event => SectionKind::Events,
86        Label::Document => SectionKind::Documents,
87        Label::Asset => SectionKind::Assets,
88    };
89
90    // Wrap the body with a fake H3 heading so we can reuse parse_entities
91    let wrapped = format!("### {name}\n{body}");
92    let mut entities = parse_entities(&wrapped, section_kind, title_line.saturating_sub(1), errors);
93
94    if let Some(mut entity) = entities.pop() {
95        entity.id = id;
96        entity.line = title_line;
97        entity
98    } else {
99        Entity {
100            name: name.to_string(),
101            label,
102            fields: Vec::new(),
103            id,
104            line: title_line,
105            tags: Vec::new(),
106            slug: None,
107        }
108    }
109}
110
111/// Parse entities from an entity section (People, Organizations, Events).
112/// The `body` is the text between the H2 heading and the next H2 heading.
113/// `section_start_line` is the line number of the H2 heading in the original file.
114#[allow(clippy::too_many_lines)]
115pub fn parse_entities(
116    body: &str,
117    section_kind: SectionKind,
118    section_start_line: usize,
119    errors: &mut Vec<ParseError>,
120) -> Vec<Entity> {
121    let Some(label) = Label::from_section(section_kind) else {
122        return Vec::new();
123    };
124
125    let lines: Vec<&str> = body.lines().collect();
126    let mut entities: Vec<Entity> = Vec::new();
127    let mut current_name: Option<String> = None;
128    let mut current_line: usize = 0;
129    let mut current_fields: Vec<(String, FieldValue)> = Vec::new();
130    // Track multi-line value continuation and nested list building
131    let mut pending_list_key: Option<String> = None;
132    let mut pending_list_items: Vec<String> = Vec::new();
133
134    for (i, line) in lines.iter().enumerate() {
135        let file_line = section_start_line + 1 + i; // +1 because body starts after the H2 heading line
136
137        // Check for H3 heading
138        if let Some(name) = strip_h3(line) {
139            // Flush pending list
140            flush_pending_list(
141                &mut pending_list_key,
142                &mut pending_list_items,
143                &mut current_fields,
144            );
145
146            // Flush previous entity
147            if let Some(entity_name) = current_name.take() {
148                let entity = build_entity(
149                    entity_name,
150                    label,
151                    current_line,
152                    &mut current_fields,
153                    errors,
154                );
155                entities.push(entity);
156            }
157
158            current_name = Some(name.to_string());
159            current_line = file_line;
160            current_fields.clear();
161            continue;
162        }
163
164        // Only parse bullet fields if we're inside an entity (after an H3)
165        if current_name.is_none() {
166            if !line.trim().is_empty() {
167                errors.push(ParseError {
168                    line: file_line,
169                    message: "content before first entity heading (### Name)".into(),
170                });
171            }
172            continue;
173        }
174
175        let trimmed = line.trim();
176
177        // Nested list item: `  - value` (2-space indent + dash)
178        if let Some(item) = trimmed.strip_prefix("- ") {
179            if line.starts_with("  - ") && pending_list_key.is_some() {
180                // Nested list item for pending list key
181                pending_list_items.push(item.trim().to_string());
182                continue;
183            }
184
185            // Flush pending list before processing new top-level bullet
186            flush_pending_list(
187                &mut pending_list_key,
188                &mut pending_list_items,
189                &mut current_fields,
190            );
191
192            // Top-level bullet: `- key: value` or `- key:`
193            if let Some((key, value)) = parse_bullet(item) {
194                if value.is_empty() {
195                    // Start a nested list: `- urls:`
196                    pending_list_key = Some(key);
197                    pending_list_items.clear();
198                } else if is_list_field(&key) && value.contains(',') {
199                    // Comma-separated list: `- aliases: A, B, C`
200                    let items: Vec<String> = value
201                        .split(',')
202                        .map(|s| s.trim().to_string())
203                        .filter(|s| !s.is_empty())
204                        .collect();
205                    current_fields.push((key, FieldValue::List(items)));
206                } else {
207                    current_fields.push((key, FieldValue::Single(value)));
208                }
209            } else {
210                errors.push(ParseError {
211                    line: file_line,
212                    message: format!(
213                        "invalid field syntax: expected `- key: value`, got {trimmed:?}"
214                    ),
215                });
216            }
217            continue;
218        }
219
220        // Multi-line value continuation (2-space indent, not a bullet)
221        if line.starts_with("  ") && !trimmed.is_empty() && !trimmed.starts_with('-') {
222            if pending_list_key.is_some() {
223                // Could be continuation inside a list context -- treat as error
224                errors.push(ParseError {
225                    line: file_line,
226                    message: "unexpected indented text in list context".into(),
227                });
228            } else if let Some(last) = current_fields.last_mut() {
229                match last.1 {
230                    FieldValue::Single(ref mut val) => {
231                        val.push('\n');
232                        val.push_str(trimmed);
233                    }
234                    FieldValue::List(ref mut items) => {
235                        // Continuation of a comma-separated list field.
236                        // Join last item with continuation text, then re-split
237                        // in case new commas appear.
238                        let tail = items.pop().unwrap_or_default();
239                        let joined = if tail.is_empty() {
240                            trimmed.to_string()
241                        } else {
242                            format!("{tail} {trimmed}")
243                        };
244                        for part in joined.split(',') {
245                            let part = part.trim().to_string();
246                            if !part.is_empty() {
247                                items.push(part);
248                            }
249                        }
250                    }
251                }
252            }
253            continue;
254        }
255
256        // Blank line or other content -- ignore
257        if !trimmed.is_empty() {
258            // Flush pending list on non-indented non-bullet content
259            flush_pending_list(
260                &mut pending_list_key,
261                &mut pending_list_items,
262                &mut current_fields,
263            );
264        }
265    }
266
267    // Flush final pending list and entity
268    flush_pending_list(
269        &mut pending_list_key,
270        &mut pending_list_items,
271        &mut current_fields,
272    );
273
274    if let Some(entity_name) = current_name.take() {
275        let entity = build_entity(
276            entity_name,
277            label,
278            current_line,
279            &mut current_fields,
280            errors,
281        );
282        entities.push(entity);
283    }
284
285    // Boundary check
286    if entities.len() > MAX_ENTITIES_PER_FILE {
287        errors.push(ParseError {
288            line: section_start_line,
289            message: format!(
290                "too many entities in section (max {MAX_ENTITIES_PER_FILE}, got {})",
291                entities.len()
292            ),
293        });
294    }
295
296    entities
297}
298
299fn flush_pending_list(
300    pending_key: &mut Option<String>,
301    pending_items: &mut Vec<String>,
302    fields: &mut Vec<(String, FieldValue)>,
303) {
304    if let Some(key) = pending_key.take() {
305        fields.push((key, FieldValue::List(std::mem::take(pending_items))));
306    }
307}
308
309fn build_entity(
310    name: String,
311    label: Label,
312    line: usize,
313    fields: &mut Vec<(String, FieldValue)>,
314    errors: &mut Vec<ParseError>,
315) -> Entity {
316    // Validate name
317    if name.trim().is_empty() {
318        errors.push(ParseError {
319            line,
320            message: "entity name must not be empty".into(),
321        });
322    } else if name.len() > MAX_NAME_LEN {
323        errors.push(ParseError {
324            line,
325            message: format!(
326                "entity name exceeds {MAX_NAME_LEN} chars (got {})",
327                name.len()
328            ),
329        });
330    }
331
332    // Extract id field before validation (not a schema field)
333    let id = extract_id_field(fields);
334
335    // Apply type: shorthand
336    apply_type_shorthand(fields, label);
337
338    // Normalize enum field values (lowercase, spaces → underscores)
339    normalize_enum_fields(fields);
340
341    // Validate fields against schema
342    validate_fields(fields, label, line, errors);
343
344    Entity {
345        name,
346        label,
347        fields: std::mem::take(fields),
348        id,
349        line,
350        tags: Vec::new(),
351        slug: None,
352    }
353}
354
355/// Extract and remove the `id` field from the field list.
356fn extract_id_field(fields: &mut Vec<(String, FieldValue)>) -> Option<String> {
357    let pos = fields.iter().position(|(k, _)| k == "id")?;
358    let (_, value) = fields.remove(pos);
359    match value {
360        FieldValue::Single(s) if !s.is_empty() => Some(s),
361        _ => None,
362    }
363}
364
365/// Replace `type:` shorthand with the label-specific field name.
366fn apply_type_shorthand(fields: &mut [(String, FieldValue)], label: Label) {
367    for field in fields.iter_mut() {
368        if field.0 == "type" {
369            field.0 = match label {
370                Label::Organization => "org_type".to_string(),
371                Label::Event => "event_type".to_string(),
372                Label::Document => "doc_type".to_string(),
373                Label::Asset => "asset_type".to_string(),
374                Label::Person => "type".to_string(), // will be caught as unknown
375            };
376        }
377    }
378}
379
/// Split a bullet item (the text after `- `) into a key and a value.
///
/// The split happens at the first `:`; both sides are trimmed. Returns `None`
/// when there is no colon or the key is blank. The value may be empty.
fn parse_bullet(item: &str) -> Option<(String, String)> {
    let (raw_key, raw_value) = item.split_once(':')?;
    let key = raw_key.trim();
    if key.is_empty() {
        None
    } else {
        Some((key.to_string(), raw_value.trim().to_string()))
    }
}
390
/// Whether `key` names a list-type field (`aliases`, `urls` or `role`).
fn is_list_field(key: &str) -> bool {
    ["aliases", "urls", "role"].contains(&key)
}
395
/// Return the trimmed heading text if `line` is an H3 heading (`### Text`).
///
/// Leading whitespace before the hashes is allowed. Lines where a `#` follows
/// the `### ` prefix are not treated as H3 headings.
fn strip_h3(line: &str) -> Option<&str> {
    let rest = line.trim_start().strip_prefix("### ")?;
    if rest.starts_with('#') {
        None
    } else {
        Some(rest.trim())
    }
}
407
408// --- Field validation ---
409
/// Fields accepted on every entity, whatever its label.
const COMMON_FIELDS: &[&str] = &[
    "qualifier",
    "aliases",
    "thumbnail",
    "thumbnail_source",
    "urls",
    "description",
];

/// Additional fields accepted on person entities.
const PERSON_FIELDS: &[&str] = &[
    "role",
    "nationality",
    "date_of_birth",
    "date_of_death",
    "place_of_birth",
    "status",
];

/// Additional fields accepted on organization entities.
const ORGANIZATION_FIELDS: &[&str] = &[
    "org_type",
    "jurisdiction",
    "headquarters",
    "founded_date",
    "registration_number",
    "status",
];

/// Additional fields accepted on event entities.
const EVENT_FIELDS: &[&str] = &["event_type", "occurred_at", "jurisdiction", "severity"];

/// Additional fields accepted on document entities.
const DOCUMENT_FIELDS: &[&str] = &["doc_type", "issued_at", "issuing_authority", "case_number"];

/// Additional fields accepted on asset entities.
const ASSET_FIELDS: &[&str] = &["asset_type", "value", "status"];
443
444/// Known enum values — delegated to domain module constants.
445use crate::domain;
446
447const ROLE_VALUES: &[&str] = domain::Role::KNOWN;
448const ORG_TYPE_VALUES: &[&str] = domain::OrgType::KNOWN;
449const EVENT_TYPE_VALUES: &[&str] = domain::EventType::KNOWN;
450const DOC_TYPE_VALUES: &[&str] = domain::DocType::KNOWN;
451const ASSET_TYPE_VALUES: &[&str] = domain::AssetType::KNOWN;
452const SEVERITY_VALUES: &[&str] = domain::Severity::KNOWN;
453const PERSON_STATUS_VALUES: &[&str] = domain::PersonStatus::KNOWN;
454const ORG_STATUS_VALUES: &[&str] = domain::OrgStatus::KNOWN;
455const ASSET_STATUS_VALUES: &[&str] = domain::AssetStatus::KNOWN;
456
/// Validation limits for one single-valued field.
struct FieldConstraint {
    /// Maximum accepted length of the value, compared against `str::len`.
    max_len: usize,
    /// Known values when the field is an enum; `None` otherwise.
    enum_values: Option<&'static [&'static str]>,
}
463
464fn field_constraint(key: &str) -> Option<FieldConstraint> {
465    match key {
466        "description" => Some(FieldConstraint {
467            max_len: 2000,
468            enum_values: None,
469        }),
470        "thumbnail" | "thumbnail_source" => Some(FieldConstraint {
471            max_len: 2048,
472            enum_values: None,
473        }),
474        "occurred_at" | "date_of_birth" | "date_of_death" | "founded_date" | "issued_at" | "opened_at"
475        | "closed_at" => Some(FieldConstraint {
476            max_len: 10,
477            enum_values: None,
478        }),
479        "place_of_birth" | "headquarters" | "issuing_authority" | "value" => {
480            Some(FieldConstraint {
481                max_len: 200,
482                enum_values: None,
483            })
484        }
485        "jurisdiction" => Some(FieldConstraint {
486            // jurisdiction: ID or ID/South Sulawesi (country + optional subdivision)
487            max_len: 203, // 2 + 1 + 200
488            enum_values: None,
489        }),
490        "role" => Some(FieldConstraint {
491            max_len: 100,
492            enum_values: Some(ROLE_VALUES),
493        }),
494        "org_type" => Some(FieldConstraint {
495            max_len: 100,
496            enum_values: Some(ORG_TYPE_VALUES),
497        }),
498        "event_type" => Some(FieldConstraint {
499            max_len: 100,
500            enum_values: Some(EVENT_TYPE_VALUES),
501        }),
502        "doc_type" => Some(FieldConstraint {
503            max_len: 100,
504            enum_values: Some(DOC_TYPE_VALUES),
505        }),
506        "asset_type" => Some(FieldConstraint {
507            max_len: 100,
508            enum_values: Some(ASSET_TYPE_VALUES),
509        }),
510        "severity" => Some(FieldConstraint {
511            max_len: 20,
512            enum_values: Some(SEVERITY_VALUES),
513        }),
514        "status" => Some(FieldConstraint {
515            // Status validation is context-dependent (Person vs Org vs Asset),
516            // handled separately in validate_fields.
517            max_len: 30,
518            enum_values: None,
519        }),
520        "qualifier" | "nationality" | "case_number" | "registration_number" => {
521            Some(FieldConstraint {
522                max_len: 100,
523                enum_values: None,
524            })
525        }
526        // List fields validated separately
527        _ => None,
528    }
529}
530
/// Maximum number of entries in an `aliases` list.
const MAX_ALIASES: usize = 10;
/// Maximum length of one alias, compared against `str::len`.
const MAX_ALIAS_LEN: usize = 200;
/// Maximum number of entries in a `urls` list.
const MAX_URLS: usize = 10;
/// Maximum length of one URL, compared against `str::len`.
const MAX_URL_LEN: usize = 2048;
536
537/// Normalize enum field values in-place: lowercase and replace spaces with
538/// underscores. Values with the `custom:` prefix are left unchanged.
539/// Handles both single-value and list-value enum fields.
540fn normalize_enum_fields(fields: &mut [(String, FieldValue)]) {
541    for (key, value) in fields.iter_mut() {
542        let is_enum = field_constraint(key).and_then(|c| c.enum_values).is_some();
543
544        match value {
545            FieldValue::Single(val) if is_enum && !val.starts_with("custom:") => {
546                let normalized = val.to_lowercase().replace(' ', "_");
547                if normalized != *val {
548                    *val = normalized;
549                }
550            }
551            FieldValue::List(items) if is_enum => {
552                for item in items.iter_mut() {
553                    if !item.starts_with("custom:") {
554                        let normalized = item.to_lowercase().replace(' ', "_");
555                        if normalized != *item {
556                            *item = normalized;
557                        }
558                    }
559                }
560            }
561            _ => {}
562        }
563    }
564}
565
566fn validate_fields(
567    fields: &[(String, FieldValue)],
568    label: Label,
569    line: usize,
570    errors: &mut Vec<ParseError>,
571) {
572    let label_fields: &[&str] = match label {
573        Label::Person => PERSON_FIELDS,
574        Label::Organization => ORGANIZATION_FIELDS,
575        Label::Event => EVENT_FIELDS,
576        Label::Document => DOCUMENT_FIELDS,
577        Label::Asset => ASSET_FIELDS,
578    };
579
580    for (key, value) in fields {
581        if !COMMON_FIELDS.contains(&key.as_str()) && !label_fields.contains(&key.as_str()) {
582            errors.push(ParseError {
583                line,
584                message: format!("unknown field {key:?} for {label}"),
585            });
586            continue;
587        }
588
589        match value {
590            FieldValue::Single(val) => validate_single_field(key, val, label, line, errors),
591            FieldValue::List(items) => validate_list_field(key, items, line, errors),
592        }
593    }
594
595    // Required field checks
596    if label == Label::Organization && !fields.iter().any(|(k, _)| k == "org_type") {
597        errors.push(ParseError {
598            line,
599            message: "organization entity missing required field \"org_type\"".into(),
600        });
601    }
602}
603
604/// Validate a single-valued field (length, enum, date, URL, status, jurisdiction, money).
605fn validate_single_field(
606    key: &str,
607    val: &str,
608    label: Label,
609    line: usize,
610    errors: &mut Vec<ParseError>,
611) {
612    if let Some(constraint) = field_constraint(key) {
613        if val.len() > constraint.max_len {
614            errors.push(ParseError {
615                line,
616                message: format!(
617                    "field {key:?} exceeds {} chars (got {})",
618                    constraint.max_len,
619                    val.len()
620                ),
621            });
622        }
623
624        if let Some(allowed) = constraint.enum_values {
625            validate_enum_value(key, val, allowed, line, errors);
626        }
627
628        if matches!(
629            key,
630            "occurred_at"
631                | "date_of_birth"
632                | "date_of_death"
633                | "founded_date"
634                | "issued_at"
635                | "opened_at"
636                | "closed_at"
637        ) && !val.is_empty()
638        {
639            validate_date_format(key, val, line, errors);
640        }
641
642        if matches!(key, "thumbnail" | "thumbnail_source")
643            && !val.is_empty()
644            && !val.starts_with("https://")
645        {
646            errors.push(ParseError {
647                line,
648                message: format!("field {key:?} must be HTTPS URL"),
649            });
650        }
651    }
652
653    if key == "status" {
654        validate_status(val, label, line, errors);
655    }
656
657    if key == "jurisdiction" && !val.is_empty() {
658        validate_jurisdiction(val, line, errors);
659    }
660
661    if key == "value" && !val.is_empty() {
662        validate_money(val, line, errors);
663    }
664}
665
666/// Validate a list-valued field (aliases, urls, role).
667fn validate_list_field(key: &str, items: &[String], line: usize, errors: &mut Vec<ParseError>) {
668    match key {
669        "aliases" => {
670            if items.len() > MAX_ALIASES {
671                errors.push(ParseError {
672                    line,
673                    message: format!(
674                        "aliases exceeds {MAX_ALIASES} items (got {})",
675                        items.len()
676                    ),
677                });
678            }
679            for item in items {
680                if item.len() > MAX_ALIAS_LEN {
681                    errors.push(ParseError {
682                        line,
683                        message: format!("alias exceeds {MAX_ALIAS_LEN} chars: {item:?}"),
684                    });
685                }
686            }
687        }
688        "urls" => {
689            if items.len() > MAX_URLS {
690                errors.push(ParseError {
691                    line,
692                    message: format!("urls exceeds {MAX_URLS} items (got {})", items.len()),
693                });
694            }
695            for item in items {
696                if item.len() > MAX_URL_LEN {
697                    errors.push(ParseError {
698                        line,
699                        message: format!("url exceeds {MAX_URL_LEN} chars: {item:?}"),
700                    });
701                }
702                if !item.starts_with("https://") {
703                    errors.push(ParseError {
704                        line,
705                        message: format!("url must be HTTPS: {item:?}"),
706                    });
707                }
708            }
709        }
710        "role" => {
711            if items.len() > MAX_ROLES {
712                errors.push(ParseError {
713                    line,
714                    message: format!("role exceeds {MAX_ROLES} items (got {})", items.len()),
715                });
716            }
717            for item in items {
718                validate_enum_value("role", item, ROLE_VALUES, line, errors);
719            }
720        }
721        _ => {}
722    }
723}
724
/// Maximum number of entries in a `role` list.
const MAX_ROLES: usize = 10;
727
728/// Validate status value based on entity label context.
729fn validate_status(value: &str, label: Label, line: usize, errors: &mut Vec<ParseError>) {
730    let allowed: &[&str] = match label {
731        Label::Person => PERSON_STATUS_VALUES,
732        Label::Organization => ORG_STATUS_VALUES,
733        Label::Asset => ASSET_STATUS_VALUES,
734        _ => {
735            errors.push(ParseError {
736                line,
737                message: format!("field \"status\" is not valid for {label}"),
738            });
739            return;
740        }
741    };
742
743    let normalized = value.to_lowercase().replace(' ', "_");
744    if !allowed.contains(&normalized.as_str()) {
745        errors.push(ParseError {
746            line,
747            message: format!(
748                "invalid status {value:?} for {label} (known: {})",
749                allowed.join(", ")
750            ),
751        });
752    }
753}
754
755/// Validate jurisdiction format: `XX` or `XX/Subdivision`.
756fn validate_jurisdiction(value: &str, line: usize, errors: &mut Vec<ParseError>) {
757    if let Some(slash_pos) = value.find('/') {
758        let country = &value[..slash_pos];
759        let subdivision = &value[slash_pos + 1..];
760        if country.len() != 2 || !country.chars().all(|c| c.is_ascii_uppercase()) {
761            errors.push(ParseError {
762                line,
763                message: format!(
764                    "jurisdiction country must be 2-letter uppercase ISO code, got {country:?}"
765                ),
766            });
767        }
768        if subdivision.is_empty() || subdivision.len() > domain::MAX_SUBDIVISION_LEN {
769            errors.push(ParseError {
770                line,
771                message: format!(
772                    "jurisdiction subdivision must be 1-{} chars",
773                    domain::MAX_SUBDIVISION_LEN
774                ),
775            });
776        }
777    } else {
778        // Just country code
779        if value.len() != 2 || !value.chars().all(|c| c.is_ascii_uppercase()) {
780            errors.push(ParseError {
781                line,
782                message: format!(
783                    "jurisdiction must be 2-letter uppercase ISO code or CODE/Subdivision, got {value:?}"
784                ),
785            });
786        }
787    }
788}
789
790/// Validate money DSL format: `amount currency "display"`.
791/// Example: `500000000000 IDR "Rp 500 billion"`
792fn validate_money(value: &str, line: usize, errors: &mut Vec<ParseError>) {
793    // Split: amount currency "display"
794    let parts: Vec<&str> = value.splitn(3, ' ').collect();
795    if parts.len() < 3 {
796        errors.push(ParseError {
797            line,
798            message: format!(
799                "invalid money format: expected `amount currency \"display\"`, got {value:?}"
800            ),
801        });
802        return;
803    }
804
805    // Validate amount is a valid integer
806    if parts[0].parse::<i64>().is_err() {
807        errors.push(ParseError {
808            line,
809            message: format!("money amount must be an integer, got {:?}", parts[0]),
810        });
811    }
812
813    // Validate currency is 3-letter uppercase
814    let currency = parts[1];
815    if currency.len() != 3 || !currency.chars().all(|c| c.is_ascii_uppercase()) {
816        errors.push(ParseError {
817            line,
818            message: format!(
819                "money currency must be 3-letter uppercase ISO code, got {currency:?}"
820            ),
821        });
822    }
823
824    // Validate display is quoted
825    let display = parts[2];
826    if !display.starts_with('"') || !display.ends_with('"') {
827        errors.push(ParseError {
828            line,
829            message: format!("money display must be quoted, got {display:?}"),
830        });
831    } else {
832        let inner = &display[1..display.len() - 1];
833        if inner.len() > domain::MAX_MONEY_DISPLAY_LEN {
834            errors.push(ParseError {
835                line,
836                message: format!(
837                    "money display exceeds {} chars (got {})",
838                    domain::MAX_MONEY_DISPLAY_LEN,
839                    inner.len()
840                ),
841            });
842        }
843    }
844}
845
846fn validate_enum_value(
847    key: &str,
848    value: &str,
849    allowed: &[&str],
850    line: usize,
851    errors: &mut Vec<ParseError>,
852) {
853    // custom: prefix is always valid (if non-empty after prefix, max 100 chars)
854    if let Some(custom) = value.strip_prefix("custom:") {
855        if custom.is_empty() || custom.len() > 100 {
856            errors.push(ParseError {
857                line,
858                message: format!(
859                    "field {key:?} custom value must be 1-100 chars, got {}",
860                    custom.len()
861                ),
862            });
863        }
864        return;
865    }
866
867    let normalized = value.to_lowercase().replace(' ', "_");
868    if !allowed.contains(&normalized.as_str()) {
869        errors.push(ParseError {
870            line,
871            message: format!(
872                "invalid {key} value {value:?} (known: {}; use \"custom:Value\" for custom)",
873                allowed.join(", ")
874            ),
875        });
876    }
877}
878
879fn validate_date_format(key: &str, value: &str, line: usize, errors: &mut Vec<ParseError>) {
880    // Valid formats: YYYY, YYYY-MM, YYYY-MM-DD
881    let valid = matches!(value.len(), 4 | 7 | 10)
882        && value.chars().enumerate().all(|(i, c)| match i {
883            4 | 7 => c == '-',
884            _ => c.is_ascii_digit(),
885        });
886
887    if !valid {
888        errors.push(ParseError {
889            line,
890            message: format!("field {key:?} must be YYYY, YYYY-MM, or YYYY-MM-DD, got {value:?}"),
891        });
892    }
893}
894
/// Unit tests for entity parsing and per-field validation.
///
/// Every test feeds a small Markdown body to `parse_entities` and inspects
/// either the returned entities or the collected `ParseError`s.
#[cfg(test)]
mod tests {
    use super::*;

    /// Returns the value stored under `key` on `entity`, if any.
    fn field<'a>(entity: &'a Entity, key: &str) -> Option<&'a FieldValue> {
        entity
            .fields
            .iter()
            .find(|(k, _)| k == key)
            .map(|(_, v)| v)
    }

    /// Reports whether any collected error message contains `needle`.
    fn has_error(errors: &[ParseError], needle: &str) -> bool {
        errors.iter().any(|e| e.message.contains(needle))
    }

    /// A person with scalar fields and a description that continues on an
    /// indented second line.
    #[test]
    fn parse_person_entity() {
        let body = [
            "",
            "### Mark Bonnick",
            "- qualifier: Arsenal Kit Manager",
            "- nationality: GB",
            "- role: custom:Kit Manager",
            "- date_of_birth: 1962",
            "- description: Academy kit manager at Arsenal FC for 22 years",
            "  (2001-2024). Age 62 at time of dismissal.",
            "",
        ]
        .join("\n");

        let mut errors = Vec::new();
        let entities = parse_entities(&body, SectionKind::People, 10, &mut errors);
        assert!(errors.is_empty(), "errors: {errors:?}");
        assert_eq!(entities.len(), 1);

        let e = &entities[0];
        assert_eq!(e.name, "Mark Bonnick");
        assert_eq!(e.label, Label::Person);
        assert_eq!(e.fields.len(), 5);

        // The continuation line is joined to the first with a newline.
        assert_eq!(
            field(e, "description"),
            Some(&FieldValue::Single(
                "Academy kit manager at Arsenal FC for 22 years\n(2001-2024). Age 62 at time of dismissal.".into()
            ))
        );
    }

    /// A comma-separated `role` value becomes a list.
    #[test]
    fn parse_person_with_role_list() {
        let body = "### Test\n- role: politician, executive\n";
        let mut errors = Vec::new();
        let entities = parse_entities(body, SectionKind::People, 1, &mut errors);
        assert!(errors.is_empty(), "errors: {errors:?}");
        assert_eq!(entities.len(), 1);
        assert_eq!(
            field(&entities[0], "role"),
            Some(&FieldValue::List(vec![
                "politician".into(),
                "executive".into(),
            ]))
        );
    }

    /// A known person `status` parses without errors.
    #[test]
    fn parse_person_with_status() {
        let body = "### Test\n- status: imprisoned\n";
        let mut errors = Vec::new();
        let entities = parse_entities(body, SectionKind::People, 1, &mut errors);
        assert!(errors.is_empty(), "errors: {errors:?}");
        assert_eq!(entities.len(), 1);
    }

    /// An unrecognised person `status` is reported.
    #[test]
    fn reject_invalid_person_status() {
        let body = "### Test\n- status: unknown_status\n";
        let mut errors = Vec::new();
        parse_entities(body, SectionKind::People, 1, &mut errors);
        assert!(has_error(&errors, "invalid status"), "errors: {errors:?}");
    }

    /// An organization using the `type:` shorthand, inline aliases, and a
    /// nested `urls:` list.
    #[test]
    fn parse_organization_with_type_shorthand() {
        let body = [
            "",
            "### Arsenal FC",
            "- type: sports_club",
            "- jurisdiction: GB",
            "- aliases: Arsenal, The Gunners, Arsenal Football Club",
            "- urls:",
            "  - https://www.arsenal.com",
            "  - https://en.wikipedia.org/wiki/Arsenal_F.C.",
            "",
        ]
        .join("\n");

        let mut errors = Vec::new();
        let entities = parse_entities(&body, SectionKind::Organizations, 20, &mut errors);
        assert!(errors.is_empty(), "errors: {errors:?}");
        assert_eq!(entities.len(), 1);

        let e = &entities[0];
        assert_eq!(e.name, "Arsenal FC");
        assert_eq!(e.label, Label::Organization);

        // `type:` is stored under the label-specific key `org_type`.
        assert_eq!(
            field(e, "org_type"),
            Some(&FieldValue::Single("sports_club".into()))
        );

        // Aliases given as a comma-separated value.
        assert_eq!(
            field(e, "aliases"),
            Some(&FieldValue::List(vec![
                "Arsenal".into(),
                "The Gunners".into(),
                "Arsenal Football Club".into(),
            ]))
        );

        // URLs given as a nested list.
        assert_eq!(
            field(e, "urls"),
            Some(&FieldValue::List(vec![
                "https://www.arsenal.com".into(),
                "https://en.wikipedia.org/wiki/Arsenal_F.C.".into(),
            ]))
        );
    }

    /// A `COUNTRY/Subdivision` jurisdiction is accepted and stored verbatim.
    #[test]
    fn parse_organization_with_jurisdiction_subdivision() {
        let body = "### Pemkab Bogor\n- org_type: local_government\n- jurisdiction: ID/West Java\n";
        let mut errors = Vec::new();
        let entities = parse_entities(body, SectionKind::Organizations, 1, &mut errors);
        assert!(errors.is_empty(), "errors: {errors:?}");
        assert_eq!(entities.len(), 1);
        assert_eq!(
            field(&entities[0], "jurisdiction"),
            Some(&FieldValue::Single("ID/West Java".into()))
        );
    }

    /// A free-text country name is not a valid jurisdiction.
    #[test]
    fn reject_invalid_jurisdiction() {
        let body = "### Test\n- org_type: corporation\n- jurisdiction: England\n";
        let mut errors = Vec::new();
        parse_entities(body, SectionKind::Organizations, 1, &mut errors);
        assert!(has_error(&errors, "jurisdiction"), "errors: {errors:?}");
    }

    /// In an events section, `type:` is stored as `event_type`.
    #[test]
    fn parse_event_with_type_shorthand() {
        let body = [
            "",
            "### Bonnick dismissal",
            "- occurred_at: 2024-12-24",
            "- type: dismissal",
            "- description: Arsenal dismisses Bonnick.",
            "",
        ]
        .join("\n");

        let mut errors = Vec::new();
        let entities = parse_entities(&body, SectionKind::Events, 50, &mut errors);
        assert!(errors.is_empty(), "errors: {errors:?}");
        assert_eq!(entities.len(), 1);

        let e = &entities[0];
        assert_eq!(e.label, Label::Event);
        assert_eq!(
            field(e, "event_type"),
            Some(&FieldValue::Single("dismissal".into()))
        );
    }

    /// An event carrying a `severity` parses without errors.
    #[test]
    fn parse_event_with_severity() {
        let body =
            "### Test event\n- event_type: bribery\n- severity: major\n- occurred_at: 2024-01-01\n";
        let mut errors = Vec::new();
        let entities = parse_entities(body, SectionKind::Events, 1, &mut errors);
        assert!(errors.is_empty(), "errors: {errors:?}");
        assert_eq!(entities.len(), 1);
    }

    /// A document entity with its document-specific fields.
    #[test]
    fn parse_document_entity() {
        let body = [
            "### Indictment No. 123",
            "- doc_type: indictment",
            "- issued_at: 2024-03-15",
            "- issuing_authority: Jakarta District Court",
            "- case_number: 123/Pid.B/2024/PN.Jkt.Pst",
        ]
        .join("\n");
        let mut errors = Vec::new();
        let entities = parse_entities(&body, SectionKind::Documents, 1, &mut errors);
        assert!(errors.is_empty(), "errors: {errors:?}");
        assert_eq!(entities.len(), 1);
        assert_eq!(entities[0].label, Label::Document);
    }

    /// An asset entity with a money value and a status.
    #[test]
    fn parse_asset_entity() {
        let body = "### Bribe payment\n- asset_type: cash\n- value: 500000000000 IDR \"Rp 500 billion\"\n- status: seized\n";
        let mut errors = Vec::new();
        let entities = parse_entities(body, SectionKind::Assets, 1, &mut errors);
        assert!(errors.is_empty(), "errors: {errors:?}");
        assert_eq!(entities.len(), 1);
        assert_eq!(entities[0].label, Label::Asset);
    }

    /// A `value` that is not in money format is reported.
    #[test]
    fn reject_invalid_money_format() {
        let body = "### Test\n- asset_type: cash\n- value: lots of money\n";
        let mut errors = Vec::new();
        parse_entities(body, SectionKind::Assets, 1, &mut errors);
        assert!(has_error(&errors, "money"), "errors: {errors:?}");
    }

    /// A key that no label defines is reported as unknown.
    #[test]
    fn reject_unknown_field() {
        let body = "### Test\n- foobar: value\n";
        let mut errors = Vec::new();
        parse_entities(body, SectionKind::People, 1, &mut errors);
        assert!(has_error(&errors, "unknown field"), "errors: {errors:?}");
    }

    /// `org_type` used on a person is reported as unknown.
    #[test]
    fn reject_wrong_label_field() {
        let body = "### Test\n- org_type: court\n";
        let mut errors = Vec::new();
        parse_entities(body, SectionKind::People, 1, &mut errors);
        assert!(has_error(&errors, "unknown field"), "errors: {errors:?}");
    }

    /// A `role` outside the known set (and without `custom:`) is reported.
    #[test]
    fn reject_invalid_enum_value() {
        let body = "### Test\n- role: wizard\n";
        let mut errors = Vec::new();
        parse_entities(body, SectionKind::People, 1, &mut errors);
        assert!(has_error(&errors, "invalid role"), "errors: {errors:?}");
    }

    /// The `custom:` prefix lets an otherwise unknown enum value through.
    #[test]
    fn accept_custom_enum_value() {
        let body = "### Test\n- role: custom:Kit Manager\n";
        let mut errors = Vec::new();
        let entities = parse_entities(body, SectionKind::People, 1, &mut errors);
        assert!(errors.is_empty(), "errors: {errors:?}");
        assert_eq!(entities.len(), 1);
    }

    /// Spaces in a single enum value are stored as underscores.
    #[test]
    fn normalize_enum_value_spaces_to_underscores() {
        let body = "### Test\n- role: civil servant\n";
        let mut errors = Vec::new();
        let entities = parse_entities(body, SectionKind::People, 1, &mut errors);
        assert!(errors.is_empty(), "errors: {errors:?}");
        assert_eq!(entities.len(), 1);
        assert_eq!(
            field(&entities[0], "role"),
            Some(&FieldValue::Single("civil_servant".into()))
        );
    }

    /// Each item of an enum list is normalized the same way.
    #[test]
    fn normalize_enum_list_values() {
        let body = "### Test\n- role: civil servant, law enforcement\n";
        let mut errors = Vec::new();
        let entities = parse_entities(body, SectionKind::People, 1, &mut errors);
        assert!(errors.is_empty(), "errors: {errors:?}");
        assert_eq!(entities.len(), 1);
        assert_eq!(
            field(&entities[0], "role"),
            Some(&FieldValue::List(vec![
                "civil_servant".into(),
                "law_enforcement".into(),
            ]))
        );
    }

    /// A spelled-out date is rejected with the format hint.
    #[test]
    fn reject_invalid_date_format() {
        let body = "### Test\n- date_of_birth: January 1990\n";
        let mut errors = Vec::new();
        parse_entities(body, SectionKind::People, 1, &mut errors);
        assert!(has_error(&errors, "YYYY"), "errors: {errors:?}");
    }

    /// Year, year-month, and full dates are all accepted.
    #[test]
    fn accept_valid_date_formats() {
        for date in &["2024", "2024-01", "2024-01-15"] {
            let body = format!("### Test\n- date_of_birth: {date}\n");
            let mut errors = Vec::new();
            parse_entities(&body, SectionKind::People, 1, &mut errors);
            assert!(
                errors.is_empty(),
                "date {date:?} should be valid: {errors:?}"
            );
        }
    }

    /// A plain-HTTP entry in `urls` is reported.
    #[test]
    fn reject_non_https_url() {
        let body = "### Test\n- urls:\n  - http://example.com\n";
        let mut errors = Vec::new();
        parse_entities(body, SectionKind::People, 1, &mut errors);
        assert!(has_error(&errors, "HTTPS"), "errors: {errors:?}");
    }

    /// A plain-HTTP `thumbnail` is reported.
    #[test]
    fn reject_non_https_thumbnail() {
        let body = "### Test\n- thumbnail: http://example.com/img.jpg\n";
        let mut errors = Vec::new();
        parse_entities(body, SectionKind::People, 1, &mut errors);
        assert!(has_error(&errors, "HTTPS"), "errors: {errors:?}");
    }

    /// Consecutive H3 headings yield separate entities in document order.
    #[test]
    fn multiple_entities() {
        let body = [
            "",
            "### Alice",
            "- nationality: NL",
            "",
            "### Bob",
            "- nationality: GB",
            "",
        ]
        .join("\n");

        let mut errors = Vec::new();
        let entities = parse_entities(&body, SectionKind::People, 1, &mut errors);
        assert!(errors.is_empty(), "errors: {errors:?}");
        assert_eq!(entities.len(), 2);
        assert_eq!(entities[0].name, "Alice");
        assert_eq!(entities[1].name, "Bob");
    }

    /// An over-long `nationality` is reported; 201 chars is well past the
    /// 100-char limit named in the expected message.
    #[test]
    fn field_max_length_violation() {
        let long_val = "a".repeat(201);
        let body = format!("### Test\n- nationality: {long_val}\n");
        let mut errors = Vec::new();
        parse_entities(&body, SectionKind::People, 1, &mut errors);
        assert!(
            has_error(&errors, "exceeds 100 chars"),
            "errors: {errors:?}"
        );
    }

    /// Eleven aliases trip the "exceeds 10" check.
    #[test]
    fn too_many_aliases() {
        let aliases: Vec<String> = (0..11).map(|i| format!("Alias{i}")).collect();
        let body = format!("### Test\n- aliases: {}\n", aliases.join(", "));
        let mut errors = Vec::new();
        parse_entities(&body, SectionKind::People, 1, &mut errors);
        assert!(has_error(&errors, "exceeds 10"), "errors: {errors:?}");
    }

    /// An organization without `org_type` is reported.
    #[test]
    fn require_org_type_for_organizations() {
        let body = "### Test Corp\n- qualifier: Test\n";
        let mut errors = Vec::new();
        parse_entities(body, SectionKind::Organizations, 1, &mut errors);
        assert!(
            has_error(&errors, "missing required field \"org_type\""),
            "errors: {errors:?}"
        );
    }

    /// Supplying `org_type` clears the required-field error.
    #[test]
    fn accept_organization_with_type() {
        let body = "### Test Corp\n- qualifier: Test\n- org_type: corporation\n";
        let mut errors = Vec::new();
        parse_entities(body, SectionKind::Organizations, 1, &mut errors);
        assert!(errors.is_empty(), "errors: {errors:?}");
    }
}