Skip to main content

weave_content/
entity.rs

1use std::fmt;
2
3use crate::parser::{ParseError, SectionKind};
4
/// Maximum number of entities accepted from one parsed section body.
/// Exceeding it is reported as a `ParseError`; the entities are still returned.
const MAX_ENTITIES_PER_FILE: usize = 50;

/// Maximum length of an entity name, compared against `str::len` (UTF-8 bytes).
const MAX_NAME_LEN: usize = 300;
10
/// Kind of entity, determined by the section (or directory) it was found in.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum Label {
    Person,
    Organization,
    Event,
    Document,
    Asset,
}

impl fmt::Display for Label {
    /// Writes the lowercase, singular name of the label (e.g. `person`).
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let text = match self {
            Self::Person => "person",
            Self::Organization => "organization",
            Self::Event => "event",
            Self::Document => "document",
            Self::Asset => "asset",
        };
        f.write_str(text)
    }
}
32
33impl Label {
34    pub fn from_section(kind: SectionKind) -> Option<Self> {
35        match kind {
36            SectionKind::People => Some(Self::Person),
37            SectionKind::Organizations => Some(Self::Organization),
38            SectionKind::Events => Some(Self::Event),
39            SectionKind::Documents => Some(Self::Document),
40            SectionKind::Assets => Some(Self::Asset),
41            _ => None,
42        }
43    }
44}
45
/// A parsed entity: its heading name, its label, and the fields listed
/// beneath the heading in source order.
#[derive(Debug, Clone)]
pub struct Entity {
    /// Heading text of the entity.
    pub name: String,
    /// Label derived from the section the entity was parsed from.
    pub label: Label,
    /// Fields as `(key, value)` pairs, in the order they were parsed.
    /// The `id` field is extracted into `id` and does not appear here.
    pub fields: Vec<(String, FieldValue)>,
    /// Stored NULID from `id:` field (None if not yet generated).
    pub id: Option<String>,
    /// Line number (1-indexed) of the H3 heading.
    pub line: usize,
    /// Tags from front matter (empty for inline entities).
    pub tags: Vec<String>,
    /// File-path slug (e.g. `people/id/harvey-moeis`). Only set for
    /// registry entities that have standalone files.
    pub slug: Option<String>,
}
62
/// A field value: either a single string or a list of strings.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum FieldValue {
    /// A scalar value; continuation lines are appended separated by `\n`.
    Single(String),
    /// A list built from a nested bullet list or a comma-separated value.
    List(Vec<String>),
}
69
70/// Parse a single entity from a standalone entity file body.
71/// The body is the text after the H1 heading (bullet fields, no H3 headings).
72/// `label` is determined by the file's directory (people/ or organizations/).
73/// `id` comes from the front matter (may be None).
74pub fn parse_entity_file_body(
75    name: &str,
76    body: &str,
77    label: Label,
78    id: Option<String>,
79    title_line: usize,
80    errors: &mut Vec<ParseError>,
81) -> Entity {
82    let section_kind = match label {
83        Label::Person => SectionKind::People,
84        Label::Organization => SectionKind::Organizations,
85        Label::Event => SectionKind::Events,
86        Label::Document => SectionKind::Documents,
87        Label::Asset => SectionKind::Assets,
88    };
89
90    // Wrap the body with a fake H3 heading so we can reuse parse_entities
91    let wrapped = format!("### {name}\n{body}");
92    let mut entities = parse_entities(&wrapped, section_kind, title_line.saturating_sub(1), errors);
93
94    if let Some(mut entity) = entities.pop() {
95        entity.id = id;
96        entity.line = title_line;
97        entity
98    } else {
99        Entity {
100            name: name.to_string(),
101            label,
102            fields: Vec::new(),
103            id,
104            line: title_line,
105            tags: Vec::new(),
106            slug: None,
107        }
108    }
109}
110
/// Parse entities from an entity section (People, Organizations, Events,
/// Documents, Assets).
///
/// The `body` is the text between the H2 heading and the next H2 heading.
/// `section_start_line` is the line number of the H2 heading in the original file.
///
/// Each `### Name` heading starts a new entity; the bullet lines below it
/// become its fields. Problems are appended to `errors` and parsing continues.
/// Returns an empty vector (and records no error) when `section_kind` has no
/// corresponding [`Label`].
#[allow(clippy::too_many_lines)]
pub fn parse_entities(
    body: &str,
    section_kind: SectionKind,
    section_start_line: usize,
    errors: &mut Vec<ParseError>,
) -> Vec<Entity> {
    let Some(label) = Label::from_section(section_kind) else {
        return Vec::new();
    };

    let lines: Vec<&str> = body.lines().collect();
    let mut entities: Vec<Entity> = Vec::new();
    // State of the entity currently being accumulated (None before the first H3).
    let mut current_name: Option<String> = None;
    let mut current_line: usize = 0;
    let mut current_fields: Vec<(String, FieldValue)> = Vec::new();
    // Track multi-line value continuation and nested list building
    let mut pending_list_key: Option<String> = None;
    let mut pending_list_items: Vec<String> = Vec::new();

    for (i, line) in lines.iter().enumerate() {
        let file_line = section_start_line + 1 + i; // +1 because body starts after the H2 heading line

        // Check for H3 heading
        if let Some(name) = strip_h3(line) {
            // Flush pending list
            flush_pending_list(
                &mut pending_list_key,
                &mut pending_list_items,
                &mut current_fields,
            );

            // Flush previous entity
            if let Some(entity_name) = current_name.take() {
                let entity = build_entity(
                    entity_name,
                    label,
                    current_line,
                    &mut current_fields,
                    errors,
                );
                entities.push(entity);
            }

            current_name = Some(name.to_string());
            current_line = file_line;
            current_fields.clear();
            continue;
        }

        // Only parse bullet fields if we're inside an entity (after an H3)
        if current_name.is_none() {
            if !line.trim().is_empty() {
                errors.push(ParseError {
                    line: file_line,
                    message: "content before first entity heading (### Name)".into(),
                });
            }
            continue;
        }

        let trimmed = line.trim();

        // Nested list item: `  - value` (2-space indent + dash)
        if let Some(item) = trimmed.strip_prefix("- ") {
            // Only an exact two-space indent counts as nested, and only while
            // a `- key:` list is open; anything else is a top-level bullet.
            if line.starts_with("  - ") && pending_list_key.is_some() {
                // Nested list item for pending list key
                pending_list_items.push(item.trim().to_string());
                continue;
            }

            // Flush pending list before processing new top-level bullet
            flush_pending_list(
                &mut pending_list_key,
                &mut pending_list_items,
                &mut current_fields,
            );

            // Top-level bullet: `- key: value` or `- key:`
            if let Some((key, value)) = parse_bullet(item) {
                if value.is_empty() {
                    // Start a nested list: `- urls:`
                    pending_list_key = Some(key);
                    pending_list_items.clear();
                } else if is_list_field(&key) && value.contains(',') {
                    // Comma-separated list: `- aliases: A, B, C`
                    let items: Vec<String> = value
                        .split(',')
                        .map(|s| s.trim().to_string())
                        .filter(|s| !s.is_empty())
                        .collect();
                    current_fields.push((key, FieldValue::List(items)));
                } else {
                    // Note: a list field with a single inline item (no comma)
                    // is stored as `Single`, not as a one-element list.
                    current_fields.push((key, FieldValue::Single(value)));
                }
            } else {
                errors.push(ParseError {
                    line: file_line,
                    message: format!(
                        "invalid field syntax: expected `- key: value`, got {trimmed:?}"
                    ),
                });
            }
            continue;
        }

        // Multi-line value continuation (2-space indent, not a bullet)
        if line.starts_with("  ") && !trimmed.is_empty() && !trimmed.starts_with('-') {
            if pending_list_key.is_some() {
                // Could be continuation inside a list context -- treat as error
                errors.push(ParseError {
                    line: file_line,
                    message: "unexpected indented text in list context".into(),
                });
            } else if let Some(last) = current_fields.last_mut() {
                match last.1 {
                    FieldValue::Single(ref mut val) => {
                        // Continuation lines are joined with a newline.
                        val.push('\n');
                        val.push_str(trimmed);
                    }
                    FieldValue::List(ref mut items) => {
                        // Continuation of a comma-separated list field.
                        // Join last item with continuation text, then re-split
                        // in case new commas appear.
                        let tail = items.pop().unwrap_or_default();
                        let joined = if tail.is_empty() {
                            trimmed.to_string()
                        } else {
                            format!("{tail} {trimmed}")
                        };
                        for part in joined.split(',') {
                            let part = part.trim().to_string();
                            if !part.is_empty() {
                                items.push(part);
                            }
                        }
                    }
                }
            }
            // With no preceding field the continuation text is dropped silently.
            continue;
        }

        // Blank line or other content -- ignore
        if !trimmed.is_empty() {
            // Flush pending list on non-indented non-bullet content
            flush_pending_list(
                &mut pending_list_key,
                &mut pending_list_items,
                &mut current_fields,
            );
        }
    }

    // Flush final pending list and entity
    flush_pending_list(
        &mut pending_list_key,
        &mut pending_list_items,
        &mut current_fields,
    );

    if let Some(entity_name) = current_name.take() {
        let entity = build_entity(
            entity_name,
            label,
            current_line,
            &mut current_fields,
            errors,
        );
        entities.push(entity);
    }

    // Boundary check: reported against the section heading line; the full
    // list of entities is still returned to the caller.
    if entities.len() > MAX_ENTITIES_PER_FILE {
        errors.push(ParseError {
            line: section_start_line,
            message: format!(
                "too many entities in section (max {MAX_ENTITIES_PER_FILE}, got {})",
                entities.len()
            ),
        });
    }

    entities
}
298
299fn flush_pending_list(
300    pending_key: &mut Option<String>,
301    pending_items: &mut Vec<String>,
302    fields: &mut Vec<(String, FieldValue)>,
303) {
304    if let Some(key) = pending_key.take() {
305        fields.push((key, FieldValue::List(std::mem::take(pending_items))));
306    }
307}
308
309fn build_entity(
310    name: String,
311    label: Label,
312    line: usize,
313    fields: &mut Vec<(String, FieldValue)>,
314    errors: &mut Vec<ParseError>,
315) -> Entity {
316    // Validate name
317    if name.trim().is_empty() {
318        errors.push(ParseError {
319            line,
320            message: "entity name must not be empty".into(),
321        });
322    } else if name.len() > MAX_NAME_LEN {
323        errors.push(ParseError {
324            line,
325            message: format!(
326                "entity name exceeds {MAX_NAME_LEN} chars (got {})",
327                name.len()
328            ),
329        });
330    }
331
332    // Extract id field before validation (not a schema field)
333    let id = extract_id_field(fields);
334
335    // Apply type: shorthand
336    apply_type_shorthand(fields, label);
337
338    // Normalize enum field values (lowercase, spaces → underscores)
339    normalize_enum_fields(fields);
340
341    // Validate fields against schema
342    validate_fields(fields, label, line, errors);
343
344    Entity {
345        name,
346        label,
347        fields: std::mem::take(fields),
348        id,
349        line,
350        tags: Vec::new(),
351        slug: None,
352    }
353}
354
355/// Extract and remove the `id` field from the field list.
356fn extract_id_field(fields: &mut Vec<(String, FieldValue)>) -> Option<String> {
357    let pos = fields.iter().position(|(k, _)| k == "id")?;
358    let (_, value) = fields.remove(pos);
359    match value {
360        FieldValue::Single(s) if !s.is_empty() => Some(s),
361        _ => None,
362    }
363}
364
365/// Replace `type:` shorthand with the label-specific field name.
366fn apply_type_shorthand(fields: &mut [(String, FieldValue)], label: Label) {
367    for field in fields.iter_mut() {
368        if field.0 == "type" {
369            field.0 = match label {
370                Label::Organization => "org_type".to_string(),
371                Label::Event => "event_type".to_string(),
372                Label::Document => "doc_type".to_string(),
373                Label::Asset => "asset_type".to_string(),
374                Label::Person => "type".to_string(), // will be caught as unknown
375            };
376        }
377    }
378}
379
/// Split a bullet item (already stripped of its leading `- `) into a
/// trimmed `(key, value)` pair at the first colon.
///
/// Returns `None` when there is no colon or the key is blank. The value may
/// be empty (`- urls:`).
fn parse_bullet(item: &str) -> Option<(String, String)> {
    let (raw_key, raw_value) = item.split_once(':')?;
    let key = raw_key.trim();
    (!key.is_empty()).then(|| (key.to_string(), raw_value.trim().to_string()))
}
390
/// Report whether `key` names a field whose value may be written as a
/// comma-separated list.
fn is_list_field(key: &str) -> bool {
    const LIST_FIELDS: [&str; 3] = ["aliases", "urls", "role"];
    LIST_FIELDS.contains(&key)
}
395
/// Strip an H3 heading prefix. Returns the trimmed heading text.
///
/// Leading whitespace before `###` is allowed, and the marker must be
/// followed by a space. Deeper headings (`####`, ...) never match, because
/// their fourth character is `#` rather than the required space, so no
/// extra guard is needed. Heading text that itself starts with `#`
/// (e.g. `### #1 Fund`) is a valid H3 and is returned as-is.
fn strip_h3(line: &str) -> Option<&str> {
    line.trim_start().strip_prefix("### ").map(str::trim)
}
407
408// --- Field validation ---
409
/// Field names accepted on every entity, regardless of label.
/// A field is known when it appears here or in the table for its label.
const COMMON_FIELDS: &[&str] = &[
    "qualifier",
    "aliases",
    "thumbnail",
    "thumbnail_source",
    "urls",
    "description",
];

/// Additional field names accepted on `Label::Person` entities.
const PERSON_FIELDS: &[&str] = &[
    "role",
    "nationality",
    "date_of_birth",
    "place_of_birth",
    "status",
];

/// Additional field names accepted on `Label::Organization` entities.
const ORGANIZATION_FIELDS: &[&str] = &[
    "org_type",
    "jurisdiction",
    "headquarters",
    "founded_date",
    "registration_number",
    "status",
];

/// Additional field names accepted on `Label::Event` entities.
const EVENT_FIELDS: &[&str] = &["event_type", "occurred_at", "jurisdiction", "severity"];

/// Additional field names accepted on `Label::Document` entities.
const DOCUMENT_FIELDS: &[&str] = &["doc_type", "issued_at", "issuing_authority", "case_number"];

/// Additional field names accepted on `Label::Asset` entities.
const ASSET_FIELDS: &[&str] = &["asset_type", "value", "status"];
442
// Known enum values — delegated to domain module constants.
use crate::domain;

// Each alias below re-exports the `KNOWN` list of the matching domain type so
// the validators in this file can refer to it by a short local name.
const ROLE_VALUES: &[&str] = domain::Role::KNOWN;
const ORG_TYPE_VALUES: &[&str] = domain::OrgType::KNOWN;
const EVENT_TYPE_VALUES: &[&str] = domain::EventType::KNOWN;
const DOC_TYPE_VALUES: &[&str] = domain::DocType::KNOWN;
const ASSET_TYPE_VALUES: &[&str] = domain::AssetType::KNOWN;
const SEVERITY_VALUES: &[&str] = domain::Severity::KNOWN;
// Status values differ per label; `validate_status` picks the right list.
const PERSON_STATUS_VALUES: &[&str] = domain::PersonStatus::KNOWN;
const ORG_STATUS_VALUES: &[&str] = domain::OrgStatus::KNOWN;
const ASSET_STATUS_VALUES: &[&str] = domain::AssetStatus::KNOWN;
455
/// Validation limits for a single-valued field.
struct FieldConstraint {
    /// Maximum accepted length, compared against `str::len` (UTF-8 bytes).
    max_len: usize,
    /// If Some, the field is an enum with these known values.
    enum_values: Option<&'static [&'static str]>,
}
462
463fn field_constraint(key: &str) -> Option<FieldConstraint> {
464    match key {
465        "description" => Some(FieldConstraint {
466            max_len: 2000,
467            enum_values: None,
468        }),
469        "thumbnail" | "thumbnail_source" => Some(FieldConstraint {
470            max_len: 2048,
471            enum_values: None,
472        }),
473        "occurred_at" | "date_of_birth" | "founded_date" | "issued_at" | "opened_at"
474        | "closed_at" => Some(FieldConstraint {
475            max_len: 10,
476            enum_values: None,
477        }),
478        "place_of_birth" | "headquarters" | "issuing_authority" | "value" => {
479            Some(FieldConstraint {
480                max_len: 200,
481                enum_values: None,
482            })
483        }
484        "jurisdiction" => Some(FieldConstraint {
485            // jurisdiction: ID or ID/South Sulawesi (country + optional subdivision)
486            max_len: 203, // 2 + 1 + 200
487            enum_values: None,
488        }),
489        "role" => Some(FieldConstraint {
490            max_len: 100,
491            enum_values: Some(ROLE_VALUES),
492        }),
493        "org_type" => Some(FieldConstraint {
494            max_len: 100,
495            enum_values: Some(ORG_TYPE_VALUES),
496        }),
497        "event_type" => Some(FieldConstraint {
498            max_len: 100,
499            enum_values: Some(EVENT_TYPE_VALUES),
500        }),
501        "doc_type" => Some(FieldConstraint {
502            max_len: 100,
503            enum_values: Some(DOC_TYPE_VALUES),
504        }),
505        "asset_type" => Some(FieldConstraint {
506            max_len: 100,
507            enum_values: Some(ASSET_TYPE_VALUES),
508        }),
509        "severity" => Some(FieldConstraint {
510            max_len: 20,
511            enum_values: Some(SEVERITY_VALUES),
512        }),
513        "status" => Some(FieldConstraint {
514            // Status validation is context-dependent (Person vs Org vs Asset),
515            // handled separately in validate_fields.
516            max_len: 30,
517            enum_values: None,
518        }),
519        "qualifier" | "nationality" | "case_number" | "registration_number" => {
520            Some(FieldConstraint {
521                max_len: 100,
522                enum_values: None,
523            })
524        }
525        // List fields validated separately
526        _ => None,
527    }
528}
529
/// Maximum number of items in an `aliases` list.
const MAX_ALIASES: usize = 10;
/// Maximum length of one alias, compared against `str::len` (UTF-8 bytes).
const MAX_ALIAS_LEN: usize = 200;
/// Maximum number of items in a `urls` list.
const MAX_URLS: usize = 10;
/// Maximum length of one URL, compared against `str::len` (UTF-8 bytes).
const MAX_URL_LEN: usize = 2048;
535
536/// Normalize enum field values in-place: lowercase and replace spaces with
537/// underscores. Values with the `custom:` prefix are left unchanged.
538/// Handles both single-value and list-value enum fields.
539fn normalize_enum_fields(fields: &mut [(String, FieldValue)]) {
540    for (key, value) in fields.iter_mut() {
541        let is_enum = field_constraint(key).and_then(|c| c.enum_values).is_some();
542
543        match value {
544            FieldValue::Single(val) if is_enum && !val.starts_with("custom:") => {
545                let normalized = val.to_lowercase().replace(' ', "_");
546                if normalized != *val {
547                    *val = normalized;
548                }
549            }
550            FieldValue::List(items) if is_enum => {
551                for item in items.iter_mut() {
552                    if !item.starts_with("custom:") {
553                        let normalized = item.to_lowercase().replace(' ', "_");
554                        if normalized != *item {
555                            *item = normalized;
556                        }
557                    }
558                }
559            }
560            _ => {}
561        }
562    }
563}
564
565#[allow(clippy::too_many_lines)]
566fn validate_fields(
567    fields: &[(String, FieldValue)],
568    label: Label,
569    line: usize,
570    errors: &mut Vec<ParseError>,
571) {
572    let label_fields: &[&str] = match label {
573        Label::Person => PERSON_FIELDS,
574        Label::Organization => ORGANIZATION_FIELDS,
575        Label::Event => EVENT_FIELDS,
576        Label::Document => DOCUMENT_FIELDS,
577        Label::Asset => ASSET_FIELDS,
578    };
579
580    for (key, value) in fields {
581        // Check if field is known
582        if !COMMON_FIELDS.contains(&key.as_str()) && !label_fields.contains(&key.as_str()) {
583            errors.push(ParseError {
584                line,
585                message: format!("unknown field {key:?} for {label}"),
586            });
587            continue;
588        }
589
590        match value {
591            FieldValue::Single(val) => {
592                if let Some(constraint) = field_constraint(key) {
593                    if val.len() > constraint.max_len {
594                        errors.push(ParseError {
595                            line,
596                            message: format!(
597                                "field {key:?} exceeds {} chars (got {})",
598                                constraint.max_len,
599                                val.len()
600                            ),
601                        });
602                    }
603
604                    // Validate enum values
605                    if let Some(allowed) = constraint.enum_values {
606                        validate_enum_value(key, val, allowed, line, errors);
607                    }
608
609                    // Validate date format
610                    if matches!(
611                        key.as_str(),
612                        "occurred_at"
613                            | "date_of_birth"
614                            | "founded_date"
615                            | "issued_at"
616                            | "opened_at"
617                            | "closed_at"
618                    ) && !val.is_empty()
619                    {
620                        validate_date_format(key, val, line, errors);
621                    }
622
623                    // Validate URL fields
624                    if matches!(key.as_str(), "thumbnail" | "thumbnail_source")
625                        && !val.is_empty()
626                        && !val.starts_with("https://")
627                    {
628                        errors.push(ParseError {
629                            line,
630                            message: format!("field {key:?} must be HTTPS URL"),
631                        });
632                    }
633                }
634
635                // Context-dependent status validation
636                if key == "status" {
637                    validate_status(val, label, line, errors);
638                }
639
640                // Validate jurisdiction format: `XX` or `XX/Subdivision`
641                if key == "jurisdiction" && !val.is_empty() {
642                    validate_jurisdiction(val, line, errors);
643                }
644
645                // Validate money format: `amount currency "display"`
646                if key == "value" && !val.is_empty() {
647                    validate_money(val, line, errors);
648                }
649            }
650            FieldValue::List(items) => match key.as_str() {
651                "aliases" => {
652                    if items.len() > MAX_ALIASES {
653                        errors.push(ParseError {
654                            line,
655                            message: format!(
656                                "aliases exceeds {MAX_ALIASES} items (got {})",
657                                items.len()
658                            ),
659                        });
660                    }
661                    for item in items {
662                        if item.len() > MAX_ALIAS_LEN {
663                            errors.push(ParseError {
664                                line,
665                                message: format!("alias exceeds {MAX_ALIAS_LEN} chars: {item:?}"),
666                            });
667                        }
668                    }
669                }
670                "urls" => {
671                    if items.len() > MAX_URLS {
672                        errors.push(ParseError {
673                            line,
674                            message: format!("urls exceeds {MAX_URLS} items (got {})", items.len()),
675                        });
676                    }
677                    for item in items {
678                        if item.len() > MAX_URL_LEN {
679                            errors.push(ParseError {
680                                line,
681                                message: format!("url exceeds {MAX_URL_LEN} chars: {item:?}"),
682                            });
683                        }
684                        if !item.starts_with("https://") {
685                            errors.push(ParseError {
686                                line,
687                                message: format!("url must be HTTPS: {item:?}"),
688                            });
689                        }
690                    }
691                }
692                "role" => {
693                    if items.len() > MAX_ROLES {
694                        errors.push(ParseError {
695                            line,
696                            message: format!(
697                                "role exceeds {MAX_ROLES} items (got {})",
698                                items.len()
699                            ),
700                        });
701                    }
702                    for item in items {
703                        validate_enum_value("role", item, ROLE_VALUES, line, errors);
704                    }
705                }
706                _ => {}
707            },
708        }
709    }
710
711    // Required field checks
712    if label == Label::Organization && !fields.iter().any(|(k, _)| k == "org_type") {
713        errors.push(ParseError {
714            line,
715            message: "organization entity missing required field \"org_type\"".into(),
716        });
717    }
718}
719
/// Maximum number of items in a `role` list.
const MAX_ROLES: usize = 10;
722
723/// Validate status value based on entity label context.
724fn validate_status(value: &str, label: Label, line: usize, errors: &mut Vec<ParseError>) {
725    let allowed: &[&str] = match label {
726        Label::Person => PERSON_STATUS_VALUES,
727        Label::Organization => ORG_STATUS_VALUES,
728        Label::Asset => ASSET_STATUS_VALUES,
729        _ => {
730            errors.push(ParseError {
731                line,
732                message: format!("field \"status\" is not valid for {label}"),
733            });
734            return;
735        }
736    };
737
738    let normalized = value.to_lowercase().replace(' ', "_");
739    if !allowed.contains(&normalized.as_str()) {
740        errors.push(ParseError {
741            line,
742            message: format!(
743                "invalid status {value:?} for {label} (known: {})",
744                allowed.join(", ")
745            ),
746        });
747    }
748}
749
750/// Validate jurisdiction format: `XX` or `XX/Subdivision`.
751fn validate_jurisdiction(value: &str, line: usize, errors: &mut Vec<ParseError>) {
752    if let Some(slash_pos) = value.find('/') {
753        let country = &value[..slash_pos];
754        let subdivision = &value[slash_pos + 1..];
755        if country.len() != 2 || !country.chars().all(|c| c.is_ascii_uppercase()) {
756            errors.push(ParseError {
757                line,
758                message: format!(
759                    "jurisdiction country must be 2-letter uppercase ISO code, got {country:?}"
760                ),
761            });
762        }
763        if subdivision.is_empty() || subdivision.len() > domain::MAX_SUBDIVISION_LEN {
764            errors.push(ParseError {
765                line,
766                message: format!(
767                    "jurisdiction subdivision must be 1-{} chars",
768                    domain::MAX_SUBDIVISION_LEN
769                ),
770            });
771        }
772    } else {
773        // Just country code
774        if value.len() != 2 || !value.chars().all(|c| c.is_ascii_uppercase()) {
775            errors.push(ParseError {
776                line,
777                message: format!(
778                    "jurisdiction must be 2-letter uppercase ISO code or CODE/Subdivision, got {value:?}"
779                ),
780            });
781        }
782    }
783}
784
785/// Validate money DSL format: `amount currency "display"`.
786/// Example: `500000000000 IDR "Rp 500 billion"`
787fn validate_money(value: &str, line: usize, errors: &mut Vec<ParseError>) {
788    // Split: amount currency "display"
789    let parts: Vec<&str> = value.splitn(3, ' ').collect();
790    if parts.len() < 3 {
791        errors.push(ParseError {
792            line,
793            message: format!(
794                "invalid money format: expected `amount currency \"display\"`, got {value:?}"
795            ),
796        });
797        return;
798    }
799
800    // Validate amount is a valid integer
801    if parts[0].parse::<i64>().is_err() {
802        errors.push(ParseError {
803            line,
804            message: format!("money amount must be an integer, got {:?}", parts[0]),
805        });
806    }
807
808    // Validate currency is 3-letter uppercase
809    let currency = parts[1];
810    if currency.len() != 3 || !currency.chars().all(|c| c.is_ascii_uppercase()) {
811        errors.push(ParseError {
812            line,
813            message: format!(
814                "money currency must be 3-letter uppercase ISO code, got {currency:?}"
815            ),
816        });
817    }
818
819    // Validate display is quoted
820    let display = parts[2];
821    if !display.starts_with('"') || !display.ends_with('"') {
822        errors.push(ParseError {
823            line,
824            message: format!("money display must be quoted, got {display:?}"),
825        });
826    } else {
827        let inner = &display[1..display.len() - 1];
828        if inner.len() > domain::MAX_MONEY_DISPLAY_LEN {
829            errors.push(ParseError {
830                line,
831                message: format!(
832                    "money display exceeds {} chars (got {})",
833                    domain::MAX_MONEY_DISPLAY_LEN,
834                    inner.len()
835                ),
836            });
837        }
838    }
839}
840
841fn validate_enum_value(
842    key: &str,
843    value: &str,
844    allowed: &[&str],
845    line: usize,
846    errors: &mut Vec<ParseError>,
847) {
848    // custom: prefix is always valid (if non-empty after prefix, max 100 chars)
849    if let Some(custom) = value.strip_prefix("custom:") {
850        if custom.is_empty() || custom.len() > 100 {
851            errors.push(ParseError {
852                line,
853                message: format!(
854                    "field {key:?} custom value must be 1-100 chars, got {}",
855                    custom.len()
856                ),
857            });
858        }
859        return;
860    }
861
862    let normalized = value.to_lowercase().replace(' ', "_");
863    if !allowed.contains(&normalized.as_str()) {
864        errors.push(ParseError {
865            line,
866            message: format!(
867                "invalid {key} value {value:?} (known: {}; use \"custom:Value\" for custom)",
868                allowed.join(", ")
869            ),
870        });
871    }
872}
873
874fn validate_date_format(key: &str, value: &str, line: usize, errors: &mut Vec<ParseError>) {
875    // Valid formats: YYYY, YYYY-MM, YYYY-MM-DD
876    let valid = matches!(value.len(), 4 | 7 | 10)
877        && value.chars().enumerate().all(|(i, c)| match i {
878            4 | 7 => c == '-',
879            _ => c.is_ascii_digit(),
880        });
881
882    if !valid {
883        errors.push(ParseError {
884            line,
885            message: format!("field {key:?} must be YYYY, YYYY-MM, or YYYY-MM-DD, got {value:?}"),
886        });
887    }
888}
889
/// Unit tests for entity parsing and per-field validation, all driven through
/// `parse_entities` with a section body, a section kind, a starting line and
/// an error sink.
#[cfg(test)]
mod tests {
    use super::*;

    /// A person with scalar fields and a description continued on an indented
    /// line parses into one entity; the continuation is joined with `\n`.
    #[test]
    fn parse_person_entity() {
        let body = [
            "",
            "### Mark Bonnick",
            "- qualifier: Arsenal Kit Manager",
            "- nationality: GB",
            "- role: custom:Kit Manager",
            "- date_of_birth: 1962",
            "- description: Academy kit manager at Arsenal FC for 22 years",
            "  (2001-2024). Age 62 at time of dismissal.",
            "",
        ]
        .join("\n");

        let mut errors = Vec::new();
        let entities = parse_entities(&body, SectionKind::People, 10, &mut errors);
        assert!(errors.is_empty(), "errors: {errors:?}");
        assert_eq!(entities.len(), 1);

        let e = &entities[0];
        assert_eq!(e.name, "Mark Bonnick");
        assert_eq!(e.label, Label::Person);
        assert_eq!(e.fields.len(), 5);

        // Check multi-line description
        let desc = e
            .fields
            .iter()
            .find(|(k, _)| k == "description")
            .map(|(_, v)| v);
        assert_eq!(
            desc,
            Some(&FieldValue::Single(
                "Academy kit manager at Arsenal FC for 22 years\n(2001-2024). Age 62 at time of dismissal.".into()
            ))
        );
    }

    /// A comma-separated `role` value is stored as a list.
    #[test]
    fn parse_person_with_role_list() {
        let body = "### Test\n- role: politician, executive\n";
        let mut errors = Vec::new();
        let entities = parse_entities(body, SectionKind::People, 1, &mut errors);
        assert!(errors.is_empty(), "errors: {errors:?}");
        let roles = entities[0].fields.iter().find(|(k, _)| k == "role");
        assert_eq!(
            roles.map(|(_, v)| v),
            Some(&FieldValue::List(vec![
                "politician".into(),
                "executive".into(),
            ]))
        );
    }

    /// A known person `status` value is accepted without errors.
    #[test]
    fn parse_person_with_status() {
        let body = "### Test\n- status: imprisoned\n";
        let mut errors = Vec::new();
        let entities = parse_entities(body, SectionKind::People, 1, &mut errors);
        assert!(errors.is_empty(), "errors: {errors:?}");
        // Also confirm the entity was actually produced, not silently dropped.
        assert_eq!(entities.len(), 1);
    }

    /// An unrecognised person `status` value is reported as "invalid status".
    #[test]
    fn reject_invalid_person_status() {
        let body = "### Test\n- status: unknown_status\n";
        let mut errors = Vec::new();
        parse_entities(body, SectionKind::People, 1, &mut errors);
        assert!(errors.iter().any(|e| e.message.contains("invalid status")));
    }

    /// In an organizations section the `type:` shorthand is stored under
    /// `org_type`; comma-separated and nested-bullet lists both become lists.
    #[test]
    fn parse_organization_with_type_shorthand() {
        let body = [
            "",
            "### Arsenal FC",
            "- type: sports_club",
            "- jurisdiction: GB",
            "- aliases: Arsenal, The Gunners, Arsenal Football Club",
            "- urls:",
            "  - https://www.arsenal.com",
            "  - https://en.wikipedia.org/wiki/Arsenal_F.C.",
            "",
        ]
        .join("\n");

        let mut errors = Vec::new();
        let entities = parse_entities(&body, SectionKind::Organizations, 20, &mut errors);
        assert!(errors.is_empty(), "errors: {errors:?}");
        assert_eq!(entities.len(), 1);

        let e = &entities[0];
        assert_eq!(e.name, "Arsenal FC");
        assert_eq!(e.label, Label::Organization);

        // type: should have been expanded to org_type:
        let it = e.fields.iter().find(|(k, _)| k == "org_type");
        assert_eq!(
            it.map(|(_, v)| v),
            Some(&FieldValue::Single("sports_club".into()))
        );

        // aliases as comma-separated
        let aliases = e.fields.iter().find(|(k, _)| k == "aliases");
        assert_eq!(
            aliases.map(|(_, v)| v),
            Some(&FieldValue::List(vec![
                "Arsenal".into(),
                "The Gunners".into(),
                "Arsenal Football Club".into(),
            ]))
        );

        // urls as nested list
        let urls = e.fields.iter().find(|(k, _)| k == "urls");
        assert_eq!(
            urls.map(|(_, v)| v),
            Some(&FieldValue::List(vec![
                "https://www.arsenal.com".into(),
                "https://en.wikipedia.org/wiki/Arsenal_F.C.".into(),
            ]))
        );
    }

    /// A jurisdiction of the form `CC/Subdivision` is accepted and stored verbatim.
    #[test]
    fn parse_organization_with_jurisdiction_subdivision() {
        let body = "### Pemkab Bogor\n- org_type: local_government\n- jurisdiction: ID/West Java\n";
        let mut errors = Vec::new();
        let entities = parse_entities(body, SectionKind::Organizations, 1, &mut errors);
        assert!(errors.is_empty(), "errors: {errors:?}");
        let j = entities[0].fields.iter().find(|(k, _)| k == "jurisdiction");
        assert_eq!(
            j.map(|(_, v)| v),
            Some(&FieldValue::Single("ID/West Java".into()))
        );
    }

    /// A free-text jurisdiction such as `England` produces a jurisdiction error.
    #[test]
    fn reject_invalid_jurisdiction() {
        let body = "### Test\n- org_type: corporation\n- jurisdiction: England\n";
        let mut errors = Vec::new();
        parse_entities(body, SectionKind::Organizations, 1, &mut errors);
        assert!(errors.iter().any(|e| e.message.contains("jurisdiction")));
    }

    /// In an events section the `type:` shorthand is stored under `event_type`.
    #[test]
    fn parse_event_with_type_shorthand() {
        let body = [
            "",
            "### Bonnick dismissal",
            "- occurred_at: 2024-12-24",
            "- type: dismissal",
            "- description: Arsenal dismisses Bonnick.",
            "",
        ]
        .join("\n");

        let mut errors = Vec::new();
        let entities = parse_entities(&body, SectionKind::Events, 50, &mut errors);
        assert!(errors.is_empty(), "errors: {errors:?}");

        let e = &entities[0];
        assert_eq!(e.label, Label::Event);
        let dt = e.fields.iter().find(|(k, _)| k == "event_type");
        assert_eq!(
            dt.map(|(_, v)| v),
            Some(&FieldValue::Single("dismissal".into()))
        );
    }

    /// An event carrying a known `severity` value is accepted without errors.
    #[test]
    fn parse_event_with_severity() {
        let body =
            "### Test event\n- event_type: bribery\n- severity: major\n- occurred_at: 2024-01-01\n";
        let mut errors = Vec::new();
        let entities = parse_entities(body, SectionKind::Events, 1, &mut errors);
        assert!(errors.is_empty(), "errors: {errors:?}");
        // Also confirm the entity was actually produced, not silently dropped.
        assert_eq!(entities.len(), 1);
    }

    /// A document with type, issue date, authority and case number parses
    /// into one entity labelled `Document`.
    #[test]
    fn parse_document_entity() {
        let body = [
            "### Indictment No. 123",
            "- doc_type: indictment",
            "- issued_at: 2024-03-15",
            "- issuing_authority: Jakarta District Court",
            "- case_number: 123/Pid.B/2024/PN.Jkt.Pst",
        ]
        .join("\n");
        let mut errors = Vec::new();
        let entities = parse_entities(&body, SectionKind::Documents, 1, &mut errors);
        assert!(errors.is_empty(), "errors: {errors:?}");
        assert_eq!(entities.len(), 1);
        assert_eq!(entities[0].label, Label::Document);
    }

    /// An asset with a well-formed money `value` (amount, currency, quoted
    /// display) parses into one entity labelled `Asset`.
    #[test]
    fn parse_asset_entity() {
        let body = "### Bribe payment\n- asset_type: cash\n- value: 500000000000 IDR \"Rp 500 billion\"\n- status: seized\n";
        let mut errors = Vec::new();
        let entities = parse_entities(body, SectionKind::Assets, 1, &mut errors);
        assert!(errors.is_empty(), "errors: {errors:?}");
        assert_eq!(entities.len(), 1);
        assert_eq!(entities[0].label, Label::Asset);
    }

    /// A free-text asset `value` is reported with a money-related error.
    #[test]
    fn reject_invalid_money_format() {
        let body = "### Test\n- asset_type: cash\n- value: lots of money\n";
        let mut errors = Vec::new();
        parse_entities(body, SectionKind::Assets, 1, &mut errors);
        assert!(errors.iter().any(|e| e.message.contains("money")));
    }

    /// A field name that is not recognised at all is reported as unknown.
    #[test]
    fn reject_unknown_field() {
        let body = "### Test\n- foobar: value\n";
        let mut errors = Vec::new();
        parse_entities(body, SectionKind::People, 1, &mut errors);
        assert!(errors.iter().any(|e| e.message.contains("unknown field")));
    }

    /// A field that belongs to another label is reported as unknown.
    #[test]
    fn reject_wrong_label_field() {
        // org_type on a person
        let body = "### Test\n- org_type: court\n";
        let mut errors = Vec::new();
        parse_entities(body, SectionKind::People, 1, &mut errors);
        assert!(errors.iter().any(|e| e.message.contains("unknown field")));
    }

    /// A `role` value outside the allowed set is reported as "invalid role".
    #[test]
    fn reject_invalid_enum_value() {
        let body = "### Test\n- role: wizard\n";
        let mut errors = Vec::new();
        parse_entities(body, SectionKind::People, 1, &mut errors);
        assert!(errors.iter().any(|e| e.message.contains("invalid role")));
    }

    /// A `custom:`-prefixed enum value is accepted even though it is not in
    /// the allowed set.
    #[test]
    fn accept_custom_enum_value() {
        let body = "### Test\n- role: custom:Kit Manager\n";
        let mut errors = Vec::new();
        let entities = parse_entities(body, SectionKind::People, 1, &mut errors);
        assert!(errors.is_empty(), "errors: {errors:?}");
        assert_eq!(entities.len(), 1);
    }

    /// A single enum value written with spaces is stored with underscores.
    #[test]
    fn normalize_enum_value_spaces_to_underscores() {
        let body = "### Test\n- role: civil servant\n";
        let mut errors = Vec::new();
        let entities = parse_entities(body, SectionKind::People, 1, &mut errors);
        assert!(errors.is_empty(), "errors: {errors:?}");
        assert_eq!(entities.len(), 1);
        let val = entities[0]
            .fields
            .iter()
            .find(|(k, _)| k == "role")
            .map(|(_, v)| match v {
                FieldValue::Single(s) => s.as_str(),
                _ => "",
            });
        assert_eq!(val, Some("civil_servant"));
    }

    /// Each item of a comma-separated enum list is stored with underscores.
    #[test]
    fn normalize_enum_list_values() {
        let body = "### Test\n- role: civil servant, law enforcement\n";
        let mut errors = Vec::new();
        let entities = parse_entities(body, SectionKind::People, 1, &mut errors);
        assert!(errors.is_empty(), "errors: {errors:?}");
        let roles = entities[0].fields.iter().find(|(k, _)| k == "role");
        assert_eq!(
            roles.map(|(_, v)| v),
            Some(&FieldValue::List(vec![
                "civil_servant".into(),
                "law_enforcement".into(),
            ]))
        );
    }

    /// A date not written as YYYY / YYYY-MM / YYYY-MM-DD is rejected.
    #[test]
    fn reject_invalid_date_format() {
        let body = "### Test\n- date_of_birth: January 1990\n";
        let mut errors = Vec::new();
        parse_entities(body, SectionKind::People, 1, &mut errors);
        assert!(errors.iter().any(|e| e.message.contains("YYYY")));
    }

    /// Year-only, year-month and full dates are all accepted.
    #[test]
    fn accept_valid_date_formats() {
        for date in &["2024", "2024-01", "2024-01-15"] {
            let body = format!("### Test\n- date_of_birth: {date}\n");
            let mut errors = Vec::new();
            parse_entities(&body, SectionKind::People, 1, &mut errors);
            assert!(
                errors.is_empty(),
                "date {date:?} should be valid: {errors:?}"
            );
        }
    }

    /// An `http://` entry in `urls` is rejected with an HTTPS error.
    #[test]
    fn reject_non_https_url() {
        let body = "### Test\n- urls:\n  - http://example.com\n";
        let mut errors = Vec::new();
        parse_entities(body, SectionKind::People, 1, &mut errors);
        assert!(errors.iter().any(|e| e.message.contains("HTTPS")));
    }

    /// An `http://` thumbnail is rejected with an HTTPS error.
    #[test]
    fn reject_non_https_thumbnail() {
        let body = "### Test\n- thumbnail: http://example.com/img.jpg\n";
        let mut errors = Vec::new();
        parse_entities(body, SectionKind::People, 1, &mut errors);
        assert!(errors.iter().any(|e| e.message.contains("HTTPS")));
    }

    /// Two H3 headings in one body yield two entities in source order.
    #[test]
    fn multiple_entities() {
        let body = [
            "",
            "### Alice",
            "- nationality: NL",
            "",
            "### Bob",
            "- nationality: GB",
            "",
        ]
        .join("\n");

        let mut errors = Vec::new();
        let entities = parse_entities(&body, SectionKind::People, 1, &mut errors);
        assert!(errors.is_empty(), "errors: {errors:?}");
        assert_eq!(entities.len(), 2);
        assert_eq!(entities[0].name, "Alice");
        assert_eq!(entities[1].name, "Bob");
    }

    /// An over-long `nationality` value triggers the "exceeds 100 chars" error.
    #[test]
    fn field_max_length_violation() {
        let long_val = "a".repeat(201);
        let body = format!("### Test\n- nationality: {long_val}\n");
        let mut errors = Vec::new();
        parse_entities(&body, SectionKind::People, 1, &mut errors);
        assert!(
            errors
                .iter()
                .any(|e| e.message.contains("exceeds 100 chars"))
        );
    }

    /// Eleven aliases trigger the "exceeds 10" list-size error.
    #[test]
    fn too_many_aliases() {
        let aliases: Vec<String> = (0..11).map(|i| format!("Alias{i}")).collect();
        let body = format!("### Test\n- aliases: {}\n", aliases.join(", "));
        let mut errors = Vec::new();
        parse_entities(&body, SectionKind::People, 1, &mut errors);
        assert!(errors.iter().any(|e| e.message.contains("exceeds 10")));
    }

    /// An organization without `org_type` is reported as missing a required field.
    #[test]
    fn require_org_type_for_organizations() {
        let body = "### Test Corp\n- qualifier: Test\n";
        let mut errors = Vec::new();
        parse_entities(body, SectionKind::Organizations, 1, &mut errors);
        assert!(
            errors
                .iter()
                .any(|e| { e.message.contains("missing required field \"org_type\"") })
        );
    }

    /// An organization that supplies `org_type` passes validation.
    #[test]
    fn accept_organization_with_type() {
        let body = "### Test Corp\n- qualifier: Test\n- org_type: corporation\n";
        let mut errors = Vec::new();
        parse_entities(body, SectionKind::Organizations, 1, &mut errors);
        assert!(errors.is_empty(), "errors: {errors:?}");
    }
}