// weave_content/entity.rs

1use std::fmt;
2
3use crate::parser::{ParseError, SectionKind};
4
/// Upper bound on the number of entities one parsed section may contain;
/// going over it is reported as a parse error.
const MAX_ENTITIES_PER_FILE: usize = 50;

/// Upper bound on an entity name, compared against `str::len` (bytes).
const MAX_NAME_LEN: usize = 300;
10
/// Kind of entity, derived from the section an entity appears in.
///
/// `Hash` is derived alongside the equality traits so a label can be used
/// directly as a `HashMap`/`HashSet` key.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum Label {
    /// Entity declared in a People section.
    Person,
    /// Entity declared in an Organizations section.
    Organization,
    /// Entity declared in an Events section.
    Event,
    /// Entity declared in a Documents section.
    Document,
    /// Entity declared in an Assets section.
    Asset,
}
20
21impl fmt::Display for Label {
22    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
23        match self {
24            Self::Person => write!(f, "person"),
25            Self::Organization => write!(f, "organization"),
26            Self::Event => write!(f, "event"),
27            Self::Document => write!(f, "document"),
28            Self::Asset => write!(f, "asset"),
29        }
30    }
31}
32
33impl Label {
34    pub fn from_section(kind: SectionKind) -> Option<Self> {
35        match kind {
36            SectionKind::People => Some(Self::Person),
37            SectionKind::Organizations => Some(Self::Organization),
38            SectionKind::Events => Some(Self::Event),
39            SectionKind::Documents => Some(Self::Document),
40            SectionKind::Assets => Some(Self::Asset),
41            _ => None,
42        }
43    }
44}
45
46/// A parsed entity with its name, label, and field map.
47#[derive(Debug, Clone)]
48pub struct Entity {
49    pub name: String,
50    pub label: Label,
51    pub fields: Vec<(String, FieldValue)>,
52    /// Stored NULID from `id:` field (None if not yet generated).
53    pub id: Option<String>,
54    /// Line number (1-indexed) of the H3 heading.
55    pub line: usize,
56    /// Tags from front matter (empty for inline entities).
57    pub tags: Vec<String>,
58    /// File-path slug (e.g. `people/id/harvey-moeis`). Only set for
59    /// registry entities that have standalone files.
60    pub slug: Option<String>,
61}
62
/// Value of one entity field.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum FieldValue {
    /// A scalar value written on the bullet itself (plus any continuation lines).
    Single(String),
    /// Several values, from a comma-separated bullet or a nested bullet list.
    List(Vec<String>),
}
69
70/// Parse a single entity from a standalone entity file body.
71/// The body is the text after the H1 heading (bullet fields, no H3 headings).
72/// `label` is determined by the file's directory (people/ or organizations/).
73/// `id` comes from the front matter (may be None).
74pub fn parse_entity_file_body(
75    name: &str,
76    body: &str,
77    label: Label,
78    id: Option<String>,
79    title_line: usize,
80    errors: &mut Vec<ParseError>,
81) -> Entity {
82    let section_kind = match label {
83        Label::Person => SectionKind::People,
84        Label::Organization => SectionKind::Organizations,
85        Label::Event => SectionKind::Events,
86        Label::Document => SectionKind::Documents,
87        Label::Asset => SectionKind::Assets,
88    };
89
90    // Wrap the body with a fake H3 heading so we can reuse parse_entities
91    let wrapped = format!("### {name}\n{body}");
92    let mut entities = parse_entities(&wrapped, section_kind, title_line.saturating_sub(1), errors);
93
94    if let Some(mut entity) = entities.pop() {
95        entity.id = id;
96        entity.line = title_line;
97        entity
98    } else {
99        Entity {
100            name: name.to_string(),
101            label,
102            fields: Vec::new(),
103            id,
104            line: title_line,
105            tags: Vec::new(),
106            slug: None,
107        }
108    }
109}
110
111/// Parse entities from an entity section (People, Organizations, Events).
112/// The `body` is the text between the H2 heading and the next H2 heading.
113/// `section_start_line` is the line number of the H2 heading in the original file.
114#[allow(clippy::too_many_lines)]
115pub fn parse_entities(
116    body: &str,
117    section_kind: SectionKind,
118    section_start_line: usize,
119    errors: &mut Vec<ParseError>,
120) -> Vec<Entity> {
121    let Some(label) = Label::from_section(section_kind) else {
122        return Vec::new();
123    };
124
125    let lines: Vec<&str> = body.lines().collect();
126    let mut entities: Vec<Entity> = Vec::new();
127    let mut current_name: Option<String> = None;
128    let mut current_line: usize = 0;
129    let mut current_fields: Vec<(String, FieldValue)> = Vec::new();
130    // Track multi-line value continuation and nested list building
131    let mut pending_list_key: Option<String> = None;
132    let mut pending_list_items: Vec<String> = Vec::new();
133
134    for (i, line) in lines.iter().enumerate() {
135        let file_line = section_start_line + 1 + i; // +1 because body starts after the H2 heading line
136
137        // Check for H3 heading
138        if let Some(name) = strip_h3(line) {
139            // Flush pending list
140            flush_pending_list(
141                &mut pending_list_key,
142                &mut pending_list_items,
143                &mut current_fields,
144            );
145
146            // Flush previous entity
147            if let Some(entity_name) = current_name.take() {
148                let entity = build_entity(
149                    entity_name,
150                    label,
151                    current_line,
152                    &mut current_fields,
153                    errors,
154                );
155                entities.push(entity);
156            }
157
158            current_name = Some(name.to_string());
159            current_line = file_line;
160            current_fields.clear();
161            continue;
162        }
163
164        // Only parse bullet fields if we're inside an entity (after an H3)
165        if current_name.is_none() {
166            if !line.trim().is_empty() {
167                errors.push(ParseError {
168                    line: file_line,
169                    message: "content before first entity heading (### Name)".into(),
170                });
171            }
172            continue;
173        }
174
175        let trimmed = line.trim();
176
177        // Nested list item: `  - value` (2-space indent + dash)
178        if let Some(item) = trimmed.strip_prefix("- ") {
179            if line.starts_with("  - ") && pending_list_key.is_some() {
180                // Nested list item for pending list key
181                pending_list_items.push(item.trim().to_string());
182                continue;
183            }
184
185            // Flush pending list before processing new top-level bullet
186            flush_pending_list(
187                &mut pending_list_key,
188                &mut pending_list_items,
189                &mut current_fields,
190            );
191
192            // Top-level bullet: `- key: value` or `- key:`
193            if let Some((key, value)) = parse_bullet(item) {
194                if value.is_empty() {
195                    // Start a nested list: `- urls:`
196                    pending_list_key = Some(key);
197                    pending_list_items.clear();
198                } else if is_list_field(&key) && value.contains(',') {
199                    // Comma-separated list: `- aliases: A, B, C`
200                    let items: Vec<String> = value
201                        .split(',')
202                        .map(|s| s.trim().to_string())
203                        .filter(|s| !s.is_empty())
204                        .collect();
205                    current_fields.push((key, FieldValue::List(items)));
206                } else {
207                    current_fields.push((key, FieldValue::Single(value)));
208                }
209            } else {
210                errors.push(ParseError {
211                    line: file_line,
212                    message: format!(
213                        "invalid field syntax: expected `- key: value`, got {trimmed:?}"
214                    ),
215                });
216            }
217            continue;
218        }
219
220        // Multi-line value continuation (2-space indent, not a bullet)
221        if line.starts_with("  ") && !trimmed.is_empty() && !trimmed.starts_with('-') {
222            if pending_list_key.is_some() {
223                // Could be continuation inside a list context -- treat as error
224                errors.push(ParseError {
225                    line: file_line,
226                    message: "unexpected indented text in list context".into(),
227                });
228            } else if let Some(last) = current_fields.last_mut() {
229                match last.1 {
230                    FieldValue::Single(ref mut val) => {
231                        val.push('\n');
232                        val.push_str(trimmed);
233                    }
234                    FieldValue::List(ref mut items) => {
235                        // Continuation of a comma-separated list field.
236                        // Join last item with continuation text, then re-split
237                        // in case new commas appear.
238                        let tail = items.pop().unwrap_or_default();
239                        let joined = if tail.is_empty() {
240                            trimmed.to_string()
241                        } else {
242                            format!("{tail} {trimmed}")
243                        };
244                        for part in joined.split(',') {
245                            let part = part.trim().to_string();
246                            if !part.is_empty() {
247                                items.push(part);
248                            }
249                        }
250                    }
251                }
252            }
253            continue;
254        }
255
256        // Blank line or other content -- ignore
257        if !trimmed.is_empty() {
258            // Flush pending list on non-indented non-bullet content
259            flush_pending_list(
260                &mut pending_list_key,
261                &mut pending_list_items,
262                &mut current_fields,
263            );
264        }
265    }
266
267    // Flush final pending list and entity
268    flush_pending_list(
269        &mut pending_list_key,
270        &mut pending_list_items,
271        &mut current_fields,
272    );
273
274    if let Some(entity_name) = current_name.take() {
275        let entity = build_entity(
276            entity_name,
277            label,
278            current_line,
279            &mut current_fields,
280            errors,
281        );
282        entities.push(entity);
283    }
284
285    // Boundary check
286    if entities.len() > MAX_ENTITIES_PER_FILE {
287        errors.push(ParseError {
288            line: section_start_line,
289            message: format!(
290                "too many entities in section (max {MAX_ENTITIES_PER_FILE}, got {})",
291                entities.len()
292            ),
293        });
294    }
295
296    entities
297}
298
299fn flush_pending_list(
300    pending_key: &mut Option<String>,
301    pending_items: &mut Vec<String>,
302    fields: &mut Vec<(String, FieldValue)>,
303) {
304    if let Some(key) = pending_key.take() {
305        fields.push((key, FieldValue::List(std::mem::take(pending_items))));
306    }
307}
308
309fn build_entity(
310    name: String,
311    label: Label,
312    line: usize,
313    fields: &mut Vec<(String, FieldValue)>,
314    errors: &mut Vec<ParseError>,
315) -> Entity {
316    // Validate name
317    if name.trim().is_empty() {
318        errors.push(ParseError {
319            line,
320            message: "entity name must not be empty".into(),
321        });
322    } else if name.len() > MAX_NAME_LEN {
323        errors.push(ParseError {
324            line,
325            message: format!(
326                "entity name exceeds {MAX_NAME_LEN} chars (got {})",
327                name.len()
328            ),
329        });
330    }
331
332    // Extract id field before validation (not a schema field)
333    let id = extract_id_field(fields);
334
335    // Apply type: shorthand
336    apply_type_shorthand(fields, label);
337
338    // Normalize enum field values (lowercase, spaces → underscores)
339    normalize_enum_fields(fields);
340
341    // Validate fields against schema
342    validate_fields(fields, label, line, errors);
343
344    Entity {
345        name,
346        label,
347        fields: std::mem::take(fields),
348        id,
349        line,
350        tags: Vec::new(),
351        slug: None,
352    }
353}
354
355/// Extract and remove the `id` field from the field list.
356fn extract_id_field(fields: &mut Vec<(String, FieldValue)>) -> Option<String> {
357    let pos = fields.iter().position(|(k, _)| k == "id")?;
358    let (_, value) = fields.remove(pos);
359    match value {
360        FieldValue::Single(s) if !s.is_empty() => Some(s),
361        _ => None,
362    }
363}
364
365/// Replace `type:` shorthand with the label-specific field name.
366fn apply_type_shorthand(fields: &mut [(String, FieldValue)], label: Label) {
367    for field in fields.iter_mut() {
368        if field.0 == "type" {
369            field.0 = match label {
370                Label::Organization => "org_type".to_string(),
371                Label::Event => "event_type".to_string(),
372                Label::Document => "doc_type".to_string(),
373                Label::Asset => "asset_type".to_string(),
374                Label::Person => "type".to_string(), // will be caught as unknown
375            };
376        }
377    }
378}
379
/// Split a bullet item (the text after `- `) into a trimmed `(key, value)`.
///
/// The split happens at the first `:`. Returns `None` when there is no colon
/// or the key is blank; the value may be empty.
fn parse_bullet(item: &str) -> Option<(String, String)> {
    let (raw_key, raw_value) = item.split_once(':')?;
    let key = raw_key.trim();
    (!key.is_empty()).then(|| (key.to_string(), raw_value.trim().to_string()))
}
390
/// Whether `key` names a field that may hold several values.
fn is_list_field(key: &str) -> bool {
    const LIST_FIELDS: [&str; 3] = ["aliases", "urls", "role"];
    LIST_FIELDS.contains(&key)
}
395
/// Return the trimmed heading text when `line` is an H3 heading (`### Name`).
///
/// Leading whitespace before the `###` is tolerated. A heading whose text
/// begins with another `#` is rejected.
fn strip_h3(line: &str) -> Option<&str> {
    line.trim_start()
        .strip_prefix("### ")
        .filter(|rest| !rest.starts_with('#'))
        .map(str::trim)
}
407
408// --- Field validation ---
409
/// Field names accepted on every entity, whatever its label.
const COMMON_FIELDS: &[&str] = &[
    "qualifier",
    "aliases",
    "thumbnail",
    "thumbnail_source",
    "urls",
    "description",
];

/// Additional field names accepted on person entities.
const PERSON_FIELDS: &[&str] = &[
    "role",
    "nationality",
    "date_of_birth",
    "place_of_birth",
    "status",
];

/// Additional field names accepted on organization entities.
const ORGANIZATION_FIELDS: &[&str] = &[
    "org_type",
    "jurisdiction",
    "headquarters",
    "founded_date",
    "registration_number",
    "status",
];

/// Additional field names accepted on event entities.
const EVENT_FIELDS: &[&str] = &["event_type", "occurred_at", "jurisdiction", "severity"];

/// Additional field names accepted on document entities.
const DOCUMENT_FIELDS: &[&str] = &["doc_type", "issued_at", "issuing_authority", "case_number"];

/// Additional field names accepted on asset entities.
const ASSET_FIELDS: &[&str] = &["asset_type", "value", "status"];
442
443/// Known enum values — delegated to domain module constants.
444use crate::domain;
445
446const ROLE_VALUES: &[&str] = domain::Role::KNOWN;
447const ORG_TYPE_VALUES: &[&str] = domain::OrgType::KNOWN;
448const EVENT_TYPE_VALUES: &[&str] = domain::EventType::KNOWN;
449const DOC_TYPE_VALUES: &[&str] = domain::DocType::KNOWN;
450const ASSET_TYPE_VALUES: &[&str] = domain::AssetType::KNOWN;
451const SEVERITY_VALUES: &[&str] = domain::Severity::KNOWN;
452const PERSON_STATUS_VALUES: &[&str] = domain::PersonStatus::KNOWN;
453const ORG_STATUS_VALUES: &[&str] = domain::OrgStatus::KNOWN;
454const ASSET_STATUS_VALUES: &[&str] = domain::AssetStatus::KNOWN;
455
/// Validation limits for one single-valued field.
struct FieldConstraint {
    /// Largest accepted `str::len` of the value.
    max_len: usize,
    /// Known values when the field is an enum; `None` for free-form fields.
    enum_values: Option<&'static [&'static str]>,
}
462
463fn field_constraint(key: &str) -> Option<FieldConstraint> {
464    match key {
465        "description" => Some(FieldConstraint {
466            max_len: 2000,
467            enum_values: None,
468        }),
469        "thumbnail" | "thumbnail_source" => Some(FieldConstraint {
470            max_len: 2048,
471            enum_values: None,
472        }),
473        "occurred_at" | "date_of_birth" | "founded_date" | "issued_at" | "opened_at"
474        | "closed_at" => Some(FieldConstraint {
475            max_len: 10,
476            enum_values: None,
477        }),
478        "place_of_birth" | "headquarters" | "issuing_authority" | "value" => {
479            Some(FieldConstraint {
480                max_len: 200,
481                enum_values: None,
482            })
483        }
484        "jurisdiction" => Some(FieldConstraint {
485            // jurisdiction: ID or ID/South Sulawesi (country + optional subdivision)
486            max_len: 203, // 2 + 1 + 200
487            enum_values: None,
488        }),
489        "role" => Some(FieldConstraint {
490            max_len: 100,
491            enum_values: Some(ROLE_VALUES),
492        }),
493        "org_type" => Some(FieldConstraint {
494            max_len: 100,
495            enum_values: Some(ORG_TYPE_VALUES),
496        }),
497        "event_type" => Some(FieldConstraint {
498            max_len: 100,
499            enum_values: Some(EVENT_TYPE_VALUES),
500        }),
501        "doc_type" => Some(FieldConstraint {
502            max_len: 100,
503            enum_values: Some(DOC_TYPE_VALUES),
504        }),
505        "asset_type" => Some(FieldConstraint {
506            max_len: 100,
507            enum_values: Some(ASSET_TYPE_VALUES),
508        }),
509        "severity" => Some(FieldConstraint {
510            max_len: 20,
511            enum_values: Some(SEVERITY_VALUES),
512        }),
513        "status" => Some(FieldConstraint {
514            // Status validation is context-dependent (Person vs Org vs Asset),
515            // handled separately in validate_fields.
516            max_len: 30,
517            enum_values: None,
518        }),
519        "qualifier" | "nationality" | "case_number" | "registration_number" => {
520            Some(FieldConstraint {
521                max_len: 100,
522                enum_values: None,
523            })
524        }
525        // List fields validated separately
526        _ => None,
527    }
528}
529
/// Most entries an `aliases` list may have.
const MAX_ALIASES: usize = 10;
/// Longest accepted alias, compared against `str::len`.
const MAX_ALIAS_LEN: usize = 200;
/// Most entries a `urls` list may have.
const MAX_URLS: usize = 10;
/// Longest accepted URL, compared against `str::len`.
const MAX_URL_LEN: usize = 2048;
535
536/// Normalize enum field values in-place: lowercase and replace spaces with
537/// underscores. Values with the `custom:` prefix are left unchanged.
538/// Handles both single-value and list-value enum fields.
539fn normalize_enum_fields(fields: &mut [(String, FieldValue)]) {
540    for (key, value) in fields.iter_mut() {
541        let is_enum = field_constraint(key).and_then(|c| c.enum_values).is_some();
542
543        match value {
544            FieldValue::Single(val) if is_enum && !val.starts_with("custom:") => {
545                let normalized = val.to_lowercase().replace(' ', "_");
546                if normalized != *val {
547                    *val = normalized;
548                }
549            }
550            FieldValue::List(items) if is_enum => {
551                for item in items.iter_mut() {
552                    if !item.starts_with("custom:") {
553                        let normalized = item.to_lowercase().replace(' ', "_");
554                        if normalized != *item {
555                            *item = normalized;
556                        }
557                    }
558                }
559            }
560            _ => {}
561        }
562    }
563}
564
565fn validate_fields(
566    fields: &[(String, FieldValue)],
567    label: Label,
568    line: usize,
569    errors: &mut Vec<ParseError>,
570) {
571    let label_fields: &[&str] = match label {
572        Label::Person => PERSON_FIELDS,
573        Label::Organization => ORGANIZATION_FIELDS,
574        Label::Event => EVENT_FIELDS,
575        Label::Document => DOCUMENT_FIELDS,
576        Label::Asset => ASSET_FIELDS,
577    };
578
579    for (key, value) in fields {
580        if !COMMON_FIELDS.contains(&key.as_str()) && !label_fields.contains(&key.as_str()) {
581            errors.push(ParseError {
582                line,
583                message: format!("unknown field {key:?} for {label}"),
584            });
585            continue;
586        }
587
588        match value {
589            FieldValue::Single(val) => validate_single_field(key, val, label, line, errors),
590            FieldValue::List(items) => validate_list_field(key, items, line, errors),
591        }
592    }
593
594    // Required field checks
595    if label == Label::Organization && !fields.iter().any(|(k, _)| k == "org_type") {
596        errors.push(ParseError {
597            line,
598            message: "organization entity missing required field \"org_type\"".into(),
599        });
600    }
601}
602
603/// Validate a single-valued field (length, enum, date, URL, status, jurisdiction, money).
604fn validate_single_field(
605    key: &str,
606    val: &str,
607    label: Label,
608    line: usize,
609    errors: &mut Vec<ParseError>,
610) {
611    if let Some(constraint) = field_constraint(key) {
612        if val.len() > constraint.max_len {
613            errors.push(ParseError {
614                line,
615                message: format!(
616                    "field {key:?} exceeds {} chars (got {})",
617                    constraint.max_len,
618                    val.len()
619                ),
620            });
621        }
622
623        if let Some(allowed) = constraint.enum_values {
624            validate_enum_value(key, val, allowed, line, errors);
625        }
626
627        if matches!(
628            key,
629            "occurred_at"
630                | "date_of_birth"
631                | "founded_date"
632                | "issued_at"
633                | "opened_at"
634                | "closed_at"
635        ) && !val.is_empty()
636        {
637            validate_date_format(key, val, line, errors);
638        }
639
640        if matches!(key, "thumbnail" | "thumbnail_source")
641            && !val.is_empty()
642            && !val.starts_with("https://")
643        {
644            errors.push(ParseError {
645                line,
646                message: format!("field {key:?} must be HTTPS URL"),
647            });
648        }
649    }
650
651    if key == "status" {
652        validate_status(val, label, line, errors);
653    }
654
655    if key == "jurisdiction" && !val.is_empty() {
656        validate_jurisdiction(val, line, errors);
657    }
658
659    if key == "value" && !val.is_empty() {
660        validate_money(val, line, errors);
661    }
662}
663
664/// Validate a list-valued field (aliases, urls, role).
665fn validate_list_field(key: &str, items: &[String], line: usize, errors: &mut Vec<ParseError>) {
666    match key {
667        "aliases" => {
668            if items.len() > MAX_ALIASES {
669                errors.push(ParseError {
670                    line,
671                    message: format!(
672                        "aliases exceeds {MAX_ALIASES} items (got {})",
673                        items.len()
674                    ),
675                });
676            }
677            for item in items {
678                if item.len() > MAX_ALIAS_LEN {
679                    errors.push(ParseError {
680                        line,
681                        message: format!("alias exceeds {MAX_ALIAS_LEN} chars: {item:?}"),
682                    });
683                }
684            }
685        }
686        "urls" => {
687            if items.len() > MAX_URLS {
688                errors.push(ParseError {
689                    line,
690                    message: format!("urls exceeds {MAX_URLS} items (got {})", items.len()),
691                });
692            }
693            for item in items {
694                if item.len() > MAX_URL_LEN {
695                    errors.push(ParseError {
696                        line,
697                        message: format!("url exceeds {MAX_URL_LEN} chars: {item:?}"),
698                    });
699                }
700                if !item.starts_with("https://") {
701                    errors.push(ParseError {
702                        line,
703                        message: format!("url must be HTTPS: {item:?}"),
704                    });
705                }
706            }
707        }
708        "role" => {
709            if items.len() > MAX_ROLES {
710                errors.push(ParseError {
711                    line,
712                    message: format!("role exceeds {MAX_ROLES} items (got {})", items.len()),
713                });
714            }
715            for item in items {
716                validate_enum_value("role", item, ROLE_VALUES, line, errors);
717            }
718        }
719        _ => {}
720    }
721}
722
/// Most entries a `role` list may have.
const MAX_ROLES: usize = 10;
725
726/// Validate status value based on entity label context.
727fn validate_status(value: &str, label: Label, line: usize, errors: &mut Vec<ParseError>) {
728    let allowed: &[&str] = match label {
729        Label::Person => PERSON_STATUS_VALUES,
730        Label::Organization => ORG_STATUS_VALUES,
731        Label::Asset => ASSET_STATUS_VALUES,
732        _ => {
733            errors.push(ParseError {
734                line,
735                message: format!("field \"status\" is not valid for {label}"),
736            });
737            return;
738        }
739    };
740
741    let normalized = value.to_lowercase().replace(' ', "_");
742    if !allowed.contains(&normalized.as_str()) {
743        errors.push(ParseError {
744            line,
745            message: format!(
746                "invalid status {value:?} for {label} (known: {})",
747                allowed.join(", ")
748            ),
749        });
750    }
751}
752
753/// Validate jurisdiction format: `XX` or `XX/Subdivision`.
754fn validate_jurisdiction(value: &str, line: usize, errors: &mut Vec<ParseError>) {
755    if let Some(slash_pos) = value.find('/') {
756        let country = &value[..slash_pos];
757        let subdivision = &value[slash_pos + 1..];
758        if country.len() != 2 || !country.chars().all(|c| c.is_ascii_uppercase()) {
759            errors.push(ParseError {
760                line,
761                message: format!(
762                    "jurisdiction country must be 2-letter uppercase ISO code, got {country:?}"
763                ),
764            });
765        }
766        if subdivision.is_empty() || subdivision.len() > domain::MAX_SUBDIVISION_LEN {
767            errors.push(ParseError {
768                line,
769                message: format!(
770                    "jurisdiction subdivision must be 1-{} chars",
771                    domain::MAX_SUBDIVISION_LEN
772                ),
773            });
774        }
775    } else {
776        // Just country code
777        if value.len() != 2 || !value.chars().all(|c| c.is_ascii_uppercase()) {
778            errors.push(ParseError {
779                line,
780                message: format!(
781                    "jurisdiction must be 2-letter uppercase ISO code or CODE/Subdivision, got {value:?}"
782                ),
783            });
784        }
785    }
786}
787
788/// Validate money DSL format: `amount currency "display"`.
789/// Example: `500000000000 IDR "Rp 500 billion"`
790fn validate_money(value: &str, line: usize, errors: &mut Vec<ParseError>) {
791    // Split: amount currency "display"
792    let parts: Vec<&str> = value.splitn(3, ' ').collect();
793    if parts.len() < 3 {
794        errors.push(ParseError {
795            line,
796            message: format!(
797                "invalid money format: expected `amount currency \"display\"`, got {value:?}"
798            ),
799        });
800        return;
801    }
802
803    // Validate amount is a valid integer
804    if parts[0].parse::<i64>().is_err() {
805        errors.push(ParseError {
806            line,
807            message: format!("money amount must be an integer, got {:?}", parts[0]),
808        });
809    }
810
811    // Validate currency is 3-letter uppercase
812    let currency = parts[1];
813    if currency.len() != 3 || !currency.chars().all(|c| c.is_ascii_uppercase()) {
814        errors.push(ParseError {
815            line,
816            message: format!(
817                "money currency must be 3-letter uppercase ISO code, got {currency:?}"
818            ),
819        });
820    }
821
822    // Validate display is quoted
823    let display = parts[2];
824    if !display.starts_with('"') || !display.ends_with('"') {
825        errors.push(ParseError {
826            line,
827            message: format!("money display must be quoted, got {display:?}"),
828        });
829    } else {
830        let inner = &display[1..display.len() - 1];
831        if inner.len() > domain::MAX_MONEY_DISPLAY_LEN {
832            errors.push(ParseError {
833                line,
834                message: format!(
835                    "money display exceeds {} chars (got {})",
836                    domain::MAX_MONEY_DISPLAY_LEN,
837                    inner.len()
838                ),
839            });
840        }
841    }
842}
843
844fn validate_enum_value(
845    key: &str,
846    value: &str,
847    allowed: &[&str],
848    line: usize,
849    errors: &mut Vec<ParseError>,
850) {
851    // custom: prefix is always valid (if non-empty after prefix, max 100 chars)
852    if let Some(custom) = value.strip_prefix("custom:") {
853        if custom.is_empty() || custom.len() > 100 {
854            errors.push(ParseError {
855                line,
856                message: format!(
857                    "field {key:?} custom value must be 1-100 chars, got {}",
858                    custom.len()
859                ),
860            });
861        }
862        return;
863    }
864
865    let normalized = value.to_lowercase().replace(' ', "_");
866    if !allowed.contains(&normalized.as_str()) {
867        errors.push(ParseError {
868            line,
869            message: format!(
870                "invalid {key} value {value:?} (known: {}; use \"custom:Value\" for custom)",
871                allowed.join(", ")
872            ),
873        });
874    }
875}
876
877fn validate_date_format(key: &str, value: &str, line: usize, errors: &mut Vec<ParseError>) {
878    // Valid formats: YYYY, YYYY-MM, YYYY-MM-DD
879    let valid = matches!(value.len(), 4 | 7 | 10)
880        && value.chars().enumerate().all(|(i, c)| match i {
881            4 | 7 => c == '-',
882            _ => c.is_ascii_digit(),
883        });
884
885    if !valid {
886        errors.push(ParseError {
887            line,
888            message: format!("field {key:?} must be YYYY, YYYY-MM, or YYYY-MM-DD, got {value:?}"),
889        });
890    }
891}
892
#[cfg(test)]
mod tests {
    use super::*;

    /// Returns the value stored under `key` on `entity`, or `None` when the
    /// entity has no such field.
    fn field<'a>(entity: &'a Entity, key: &str) -> Option<&'a FieldValue> {
        entity
            .fields
            .iter()
            .find(|(name, _)| name.as_str() == key)
            .map(|(_, value)| value)
    }

    /// A person with scalar fields and a description wrapped over two lines.
    #[test]
    fn parse_person_entity() {
        let body = [
            "",
            "### Mark Bonnick",
            "- qualifier: Arsenal Kit Manager",
            "- nationality: GB",
            "- role: custom:Kit Manager",
            "- date_of_birth: 1962",
            "- description: Academy kit manager at Arsenal FC for 22 years",
            "  (2001-2024). Age 62 at time of dismissal.",
            "",
        ]
        .join("\n");

        let mut errors = Vec::new();
        let entities = parse_entities(&body, SectionKind::People, 10, &mut errors);
        assert!(errors.is_empty(), "errors: {errors:?}");
        assert_eq!(entities.len(), 1);

        let e = &entities[0];
        assert_eq!(e.name, "Mark Bonnick");
        assert_eq!(e.label, Label::Person);
        assert_eq!(e.fields.len(), 5);

        // The continuation line is expected to be joined with a newline.
        assert_eq!(
            field(e, "description"),
            Some(&FieldValue::Single(
                "Academy kit manager at Arsenal FC for 22 years\n(2001-2024). Age 62 at time of dismissal.".into()
            ))
        );
    }

    /// A comma-separated `role` value is parsed into a list.
    #[test]
    fn parse_person_with_role_list() {
        let body = "### Test\n- role: politician, executive\n";
        let mut errors = Vec::new();
        let entities = parse_entities(body, SectionKind::People, 1, &mut errors);
        assert!(errors.is_empty(), "errors: {errors:?}");
        assert_eq!(
            field(&entities[0], "role"),
            Some(&FieldValue::List(vec![
                "politician".into(),
                "executive".into(),
            ]))
        );
    }

    /// A known person `status` value parses without errors.
    #[test]
    fn parse_person_with_status() {
        let body = "### Test\n- status: imprisoned\n";
        let mut errors = Vec::new();
        let entities = parse_entities(body, SectionKind::People, 1, &mut errors);
        assert!(errors.is_empty(), "errors: {errors:?}");
        // Also check the parse result, so the binding is not left unused.
        assert_eq!(entities.len(), 1);
    }

    /// An unrecognised person `status` value is reported.
    #[test]
    fn reject_invalid_person_status() {
        let body = "### Test\n- status: unknown_status\n";
        let mut errors = Vec::new();
        parse_entities(body, SectionKind::People, 1, &mut errors);
        assert!(errors.iter().any(|e| e.message.contains("invalid status")));
    }

    /// An organization using the `type:` shorthand, comma-separated aliases,
    /// and a nested `urls:` list.
    #[test]
    fn parse_organization_with_type_shorthand() {
        let body = [
            "",
            "### Arsenal FC",
            "- type: sports_club",
            "- jurisdiction: GB",
            "- aliases: Arsenal, The Gunners, Arsenal Football Club",
            "- urls:",
            "  - https://www.arsenal.com",
            "  - https://en.wikipedia.org/wiki/Arsenal_F.C.",
            "",
        ]
        .join("\n");

        let mut errors = Vec::new();
        let entities = parse_entities(&body, SectionKind::Organizations, 20, &mut errors);
        assert!(errors.is_empty(), "errors: {errors:?}");
        assert_eq!(entities.len(), 1);

        let e = &entities[0];
        assert_eq!(e.name, "Arsenal FC");
        assert_eq!(e.label, Label::Organization);

        // type: should have been expanded to org_type:
        assert_eq!(
            field(e, "org_type"),
            Some(&FieldValue::Single("sports_club".into()))
        );

        // aliases as comma-separated
        assert_eq!(
            field(e, "aliases"),
            Some(&FieldValue::List(vec![
                "Arsenal".into(),
                "The Gunners".into(),
                "Arsenal Football Club".into(),
            ]))
        );

        // urls as nested list
        assert_eq!(
            field(e, "urls"),
            Some(&FieldValue::List(vec![
                "https://www.arsenal.com".into(),
                "https://en.wikipedia.org/wiki/Arsenal_F.C.".into(),
            ]))
        );
    }

    /// A jurisdiction written as `country/subdivision` is kept verbatim.
    #[test]
    fn parse_organization_with_jurisdiction_subdivision() {
        let body = "### Pemkab Bogor\n- org_type: local_government\n- jurisdiction: ID/West Java\n";
        let mut errors = Vec::new();
        let entities = parse_entities(body, SectionKind::Organizations, 1, &mut errors);
        assert!(errors.is_empty(), "errors: {errors:?}");
        assert_eq!(
            field(&entities[0], "jurisdiction"),
            Some(&FieldValue::Single("ID/West Java".into()))
        );
    }

    /// A jurisdiction spelled as a country name is reported.
    #[test]
    fn reject_invalid_jurisdiction() {
        let body = "### Test\n- org_type: corporation\n- jurisdiction: England\n";
        let mut errors = Vec::new();
        parse_entities(body, SectionKind::Organizations, 1, &mut errors);
        assert!(errors.iter().any(|e| e.message.contains("jurisdiction")));
    }

    /// On an event, the `type:` shorthand is stored under `event_type`.
    #[test]
    fn parse_event_with_type_shorthand() {
        let body = [
            "",
            "### Bonnick dismissal",
            "- occurred_at: 2024-12-24",
            "- type: dismissal",
            "- description: Arsenal dismisses Bonnick.",
            "",
        ]
        .join("\n");

        let mut errors = Vec::new();
        let entities = parse_entities(&body, SectionKind::Events, 50, &mut errors);
        assert!(errors.is_empty(), "errors: {errors:?}");

        let e = &entities[0];
        assert_eq!(e.label, Label::Event);
        assert_eq!(
            field(e, "event_type"),
            Some(&FieldValue::Single("dismissal".into()))
        );
    }

    /// An event carrying a known `severity` value parses without errors.
    #[test]
    fn parse_event_with_severity() {
        let body =
            "### Test event\n- event_type: bribery\n- severity: major\n- occurred_at: 2024-01-01\n";
        let mut errors = Vec::new();
        let entities = parse_entities(body, SectionKind::Events, 1, &mut errors);
        assert!(errors.is_empty(), "errors: {errors:?}");
        // Also check the parse result, so the binding is not left unused.
        assert_eq!(entities.len(), 1);
    }

    /// A document entity with its document-specific fields.
    #[test]
    fn parse_document_entity() {
        let body = [
            "### Indictment No. 123",
            "- doc_type: indictment",
            "- issued_at: 2024-03-15",
            "- issuing_authority: Jakarta District Court",
            "- case_number: 123/Pid.B/2024/PN.Jkt.Pst",
        ]
        .join("\n");
        let mut errors = Vec::new();
        let entities = parse_entities(&body, SectionKind::Documents, 1, &mut errors);
        assert!(errors.is_empty(), "errors: {errors:?}");
        assert_eq!(entities.len(), 1);
        assert_eq!(entities[0].label, Label::Document);
    }

    /// An asset entity with a money `value` and a `status`.
    #[test]
    fn parse_asset_entity() {
        let body = "### Bribe payment\n- asset_type: cash\n- value: 500000000000 IDR \"Rp 500 billion\"\n- status: seized\n";
        let mut errors = Vec::new();
        let entities = parse_entities(body, SectionKind::Assets, 1, &mut errors);
        assert!(errors.is_empty(), "errors: {errors:?}");
        assert_eq!(entities.len(), 1);
        assert_eq!(entities[0].label, Label::Asset);
    }

    /// A free-text asset `value` is reported as a money-format error.
    #[test]
    fn reject_invalid_money_format() {
        let body = "### Test\n- asset_type: cash\n- value: lots of money\n";
        let mut errors = Vec::new();
        parse_entities(body, SectionKind::Assets, 1, &mut errors);
        assert!(errors.iter().any(|e| e.message.contains("money")));
    }

    /// A field name that is not recognised is reported.
    #[test]
    fn reject_unknown_field() {
        let body = "### Test\n- foobar: value\n";
        let mut errors = Vec::new();
        parse_entities(body, SectionKind::People, 1, &mut errors);
        assert!(errors.iter().any(|e| e.message.contains("unknown field")));
    }

    /// A field that belongs to another label is reported as unknown.
    #[test]
    fn reject_wrong_label_field() {
        // org_type on a person
        let body = "### Test\n- org_type: court\n";
        let mut errors = Vec::new();
        parse_entities(body, SectionKind::People, 1, &mut errors);
        assert!(errors.iter().any(|e| e.message.contains("unknown field")));
    }

    /// An enum field with a value outside the known set is reported.
    #[test]
    fn reject_invalid_enum_value() {
        let body = "### Test\n- role: wizard\n";
        let mut errors = Vec::new();
        parse_entities(body, SectionKind::People, 1, &mut errors);
        assert!(errors.iter().any(|e| e.message.contains("invalid role")));
    }

    /// A `custom:`-prefixed enum value is accepted.
    #[test]
    fn accept_custom_enum_value() {
        let body = "### Test\n- role: custom:Kit Manager\n";
        let mut errors = Vec::new();
        let entities = parse_entities(body, SectionKind::People, 1, &mut errors);
        assert!(errors.is_empty(), "errors: {errors:?}");
        assert_eq!(entities.len(), 1);
    }

    /// Spaces inside a single enum value are stored as underscores.
    #[test]
    fn normalize_enum_value_spaces_to_underscores() {
        let body = "### Test\n- role: civil servant\n";
        let mut errors = Vec::new();
        let entities = parse_entities(body, SectionKind::People, 1, &mut errors);
        assert!(errors.is_empty(), "errors: {errors:?}");
        assert_eq!(entities.len(), 1);
        assert_eq!(
            field(&entities[0], "role"),
            Some(&FieldValue::Single("civil_servant".into()))
        );
    }

    /// Spaces are stored as underscores in every element of an enum list.
    #[test]
    fn normalize_enum_list_values() {
        let body = "### Test\n- role: civil servant, law enforcement\n";
        let mut errors = Vec::new();
        let entities = parse_entities(body, SectionKind::People, 1, &mut errors);
        assert!(errors.is_empty(), "errors: {errors:?}");
        assert_eq!(
            field(&entities[0], "role"),
            Some(&FieldValue::List(vec![
                "civil_servant".into(),
                "law_enforcement".into(),
            ]))
        );
    }

    /// A date written in words is reported with the expected-format hint.
    #[test]
    fn reject_invalid_date_format() {
        let body = "### Test\n- date_of_birth: January 1990\n";
        let mut errors = Vec::new();
        parse_entities(body, SectionKind::People, 1, &mut errors);
        assert!(errors.iter().any(|e| e.message.contains("YYYY")));
    }

    /// Year, year-month, and full dates are all accepted.
    #[test]
    fn accept_valid_date_formats() {
        for date in &["2024", "2024-01", "2024-01-15"] {
            let body = format!("### Test\n- date_of_birth: {date}\n");
            let mut errors = Vec::new();
            parse_entities(&body, SectionKind::People, 1, &mut errors);
            assert!(
                errors.is_empty(),
                "date {date:?} should be valid: {errors:?}"
            );
        }
    }

    /// A plain-HTTP entry in `urls` is reported.
    #[test]
    fn reject_non_https_url() {
        let body = "### Test\n- urls:\n  - http://example.com\n";
        let mut errors = Vec::new();
        parse_entities(body, SectionKind::People, 1, &mut errors);
        assert!(errors.iter().any(|e| e.message.contains("HTTPS")));
    }

    /// A plain-HTTP `thumbnail` is reported.
    #[test]
    fn reject_non_https_thumbnail() {
        let body = "### Test\n- thumbnail: http://example.com/img.jpg\n";
        let mut errors = Vec::new();
        parse_entities(body, SectionKind::People, 1, &mut errors);
        assert!(errors.iter().any(|e| e.message.contains("HTTPS")));
    }

    /// Two headings in one section yield two entities in document order.
    #[test]
    fn multiple_entities() {
        let body = [
            "",
            "### Alice",
            "- nationality: NL",
            "",
            "### Bob",
            "- nationality: GB",
            "",
        ]
        .join("\n");

        let mut errors = Vec::new();
        let entities = parse_entities(&body, SectionKind::People, 1, &mut errors);
        assert!(errors.is_empty(), "errors: {errors:?}");
        assert_eq!(entities.len(), 2);
        assert_eq!(entities[0].name, "Alice");
        assert_eq!(entities[1].name, "Bob");
    }

    /// An over-long `nationality` value is reported as exceeding 100 chars.
    #[test]
    fn field_max_length_violation() {
        let long_val = "a".repeat(201);
        let body = format!("### Test\n- nationality: {long_val}\n");
        let mut errors = Vec::new();
        parse_entities(&body, SectionKind::People, 1, &mut errors);
        assert!(
            errors
                .iter()
                .any(|e| e.message.contains("exceeds 100 chars"))
        );
    }

    /// Eleven aliases are reported as exceeding the limit of 10.
    #[test]
    fn too_many_aliases() {
        let aliases: Vec<String> = (0..11).map(|i| format!("Alias{i}")).collect();
        let body = format!("### Test\n- aliases: {}\n", aliases.join(", "));
        let mut errors = Vec::new();
        parse_entities(&body, SectionKind::People, 1, &mut errors);
        assert!(errors.iter().any(|e| e.message.contains("exceeds 10")));
    }

    /// An organization without `org_type` is reported as missing it.
    #[test]
    fn require_org_type_for_organizations() {
        let body = "### Test Corp\n- qualifier: Test\n";
        let mut errors = Vec::new();
        parse_entities(body, SectionKind::Organizations, 1, &mut errors);
        assert!(
            errors
                .iter()
                .any(|e| { e.message.contains("missing required field \"org_type\"") })
        );
    }

    /// An organization that supplies `org_type` parses without errors.
    #[test]
    fn accept_organization_with_type() {
        let body = "### Test Corp\n- qualifier: Test\n- org_type: corporation\n";
        let mut errors = Vec::new();
        parse_entities(body, SectionKind::Organizations, 1, &mut errors);
        assert!(errors.is_empty(), "errors: {errors:?}");
    }
}