Skip to main content

ftm_types/
schema.rs

1//! FTM schema parser
2//!
3//! Parses YAML schemas from FollowTheMoney and resolves inheritance chains.
4
5use anyhow::{Context, Result};
6use serde::{Deserialize, Serialize};
7use std::collections::{HashMap, HashSet};
8use std::fs;
9use std::path::Path;
10
/// FTM schema definition from YAML
///
/// Every field is optional in the YAML: missing keys deserialize to `None`
/// (or an empty map for `properties`), and keys that are not modelled here
/// are collected into `_extra` instead of causing a parse error.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct FtmSchema {
    /// Optional `label` entry of the schema document.
    #[serde(default)]
    pub label: Option<String>,
    /// Optional `plural` entry of the schema document.
    #[serde(default)]
    pub plural: Option<String>,
    /// Optional `description` entry of the schema document.
    #[serde(default)]
    pub description: Option<String>,
    /// Names of the parent schemas; the registry walks these when it
    /// resolves inherited properties and required fields.
    #[serde(default)]
    pub extends: Option<Vec<String>>,
    /// Value of the YAML key `abstract` (renamed because `abstract` is a
    /// reserved word in Rust). Readers in this file treat `None` as `false`.
    #[serde(rename = "abstract", default)]
    pub abstract_: Option<bool>,
    /// Optional `matchable` entry of the schema document.
    #[serde(default)]
    pub matchable: Option<bool>,
    /// Optional `featured` list of the schema document.
    #[serde(default)]
    pub featured: Option<Vec<String>>,
    /// Required field names declared directly on this schema; the registry
    /// merges them with the parents' lists during resolution.
    #[serde(default)]
    pub required: Option<Vec<String>>,
    /// Optional `caption` list of the schema document.
    #[serde(default)]
    pub caption: Option<Vec<String>>,
    /// Properties declared directly on this schema, keyed by property name
    /// (inherited properties are not included here).
    #[serde(default)]
    pub properties: HashMap<String, FtmProperty>,
    // Ignore other fields we don't need
    // (`flatten` routes every key not matched above into this map)
    #[serde(flatten)]
    pub _extra: HashMap<String, serde_json::Value>,
}
38
/// Reverse property definition for bidirectional relationships
///
/// Both fields are mandatory: neither is an `Option` nor carries a serde
/// default, so a `reverse` entry missing either key fails to deserialize.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ReverseProperty {
    /// The `name` entry of the reverse definition
    /// (presumably the property name on the referenced schema; verify against the FTM model).
    pub name: String,
    /// The `label` entry of the reverse definition.
    pub label: String,
}
45
46/// FTM property definition
47#[derive(Debug, Clone, Serialize, Deserialize)]
48pub struct FtmProperty {
49    pub label: Option<String>,
50    #[serde(rename = "type", default)]
51    pub type_: Option<String>,
52    #[serde(default)]
53    pub description: Option<String>,
54    #[serde(default)]
55    pub reverse: Option<ReverseProperty>,
56    #[serde(default)]
57    pub range: Option<String>,
58    #[serde(default)]
59    pub stub: Option<bool>,
60    #[serde(default)]
61    pub deprecated: Option<bool>,
62    // Ignore other fields we don't need
63    #[serde(flatten)]
64    pub _extra: HashMap<String, serde_json::Value>,
65}
66
67/// Schema registry that manages all loaded schemas
68pub struct SchemaRegistry {
69    schemas: HashMap<String, FtmSchema>,
70}
71
72impl SchemaRegistry {
73    /// Load all schemas from a cache directory
74    pub fn load_from_cache<P: AsRef<Path>>(cache_dir: P) -> Result<Self> {
75        let cache_path = cache_dir.as_ref();
76
77        if !cache_path.exists() {
78            anyhow::bail!("Cache directory does not exist: {:?}", cache_path);
79        }
80
81        let mut schemas = HashMap::new();
82
83        // Read all YAML files (both .yml and .yaml extensions)
84        for entry in fs::read_dir(cache_path)? {
85            let entry = entry?;
86            let path = entry.path();
87
88            let ext = path.extension().and_then(|s| s.to_str());
89            if ext == Some("yml") || ext == Some("yaml") {
90                let schema_name = path
91                    .file_stem()
92                    .and_then(|s| s.to_str())
93                    .context("Invalid schema filename")?
94                    .to_string();
95
96                let content = fs::read_to_string(&path).context(format!(
97                    "Failed to read schema file: {} at {:?}",
98                    schema_name, path
99                ))?;
100
101                // Try parsing as wrapped format first (FTM official format)
102                // where schema name is root key: "Person: { label: Person, ... }"
103                let schema = if let Ok(root) =
104                    serde_yaml::from_str::<HashMap<String, FtmSchema>>(&content)
105                {
106                    // Extract the schema (should only have one key)
107                    root.into_iter()
108                        .next()
109                        .map(|(_, schema)| schema)
110                        .context(format!("Empty schema file: {} at {:?}", schema_name, path))?
111                } else {
112                    // Fall back to direct format (used in tests)
113                    // where the file directly contains: "{ label: Thing, ... }"
114                    serde_yaml::from_str::<FtmSchema>(&content).context(format!(
115                        "Failed to parse schema: {} at {:?}\nFirst 500 chars of content:\n{}",
116                        schema_name,
117                        path,
118                        &content.chars().take(500).collect::<String>()
119                    ))?
120                };
121
122                schemas.insert(schema_name.clone(), schema);
123            }
124        }
125
126        if schemas.is_empty() {
127            anyhow::bail!("No schemas found in cache directory");
128        }
129
130        println!("Loaded {} schemas", schemas.len());
131        Ok(Self { schemas })
132    }
133
134    /// Get a schema by name
135    pub fn get(&self, name: &str) -> Option<&FtmSchema> {
136        self.schemas.get(name)
137    }
138
139    /// Get all schema names
140    pub fn schema_names(&self) -> Vec<String> {
141        let mut names: Vec<_> = self.schemas.keys().cloned().collect();
142        names.sort();
143        names
144    }
145
146    /// Count of schemas
147    pub fn count(&self) -> usize {
148        self.schemas.len()
149    }
150
151    /// Resolve a schema with all inherited properties
152    pub fn resolve_inheritance(&self, schema_name: &str) -> Result<ResolvedSchema> {
153        let mut visited = HashSet::new();
154        let all_properties = self.resolve_properties_recursive(schema_name, &mut visited)?;
155
156        let mut visited_required = HashSet::new();
157        let all_required = self.resolve_required_recursive(schema_name, &mut visited_required)?;
158
159        let metadata = self
160            .get(schema_name)
161            .context(format!("Schema not found: {}", schema_name))?
162            .clone();
163
164        Ok(ResolvedSchema {
165            name: schema_name.to_string(),
166            all_properties,
167            all_required,
168            metadata,
169        })
170    }
171
172    /// Recursively resolve properties from parent schemas
173    fn resolve_properties_recursive(
174        &self,
175        schema_name: &str,
176        visited: &mut HashSet<String>,
177    ) -> Result<HashMap<String, FtmProperty>> {
178        // Check for circular inheritance (schema directly referencing itself in the current path)
179        if visited.contains(schema_name) {
180            anyhow::bail!("Circular inheritance detected: {}", schema_name);
181        }
182        visited.insert(schema_name.to_string());
183
184        let schema = self
185            .get(schema_name)
186            .context(format!("Schema not found: {}", schema_name))?;
187
188        let mut properties = HashMap::new();
189
190        // First, collect properties from parent schemas
191        if let Some(extends) = &schema.extends {
192            for parent_name in extends {
193                let parent_props = self.resolve_properties_recursive(parent_name, visited)?;
194                properties.extend(parent_props);
195            }
196        }
197
198        // Then, add/override with this schema's properties
199        properties.extend(schema.properties.clone());
200
201        // Remove from visited set to allow diamond inheritance patterns
202        visited.remove(schema_name);
203
204        Ok(properties)
205    }
206
207    /// Recursively resolve required fields from parent schemas
208    fn resolve_required_recursive(
209        &self,
210        schema_name: &str,
211        visited: &mut HashSet<String>,
212    ) -> Result<HashSet<String>> {
213        // Check for circular inheritance
214        if visited.contains(schema_name) {
215            anyhow::bail!("Circular inheritance detected: {}", schema_name);
216        }
217        visited.insert(schema_name.to_string());
218
219        let schema = self
220            .get(schema_name)
221            .context(format!("Schema not found: {}", schema_name))?;
222
223        let mut required = HashSet::new();
224
225        // First, collect required fields from parent schemas
226        if let Some(extends) = &schema.extends {
227            for parent_name in extends {
228                let parent_required = self.resolve_required_recursive(parent_name, visited)?;
229                required.extend(parent_required);
230            }
231        }
232
233        // Then, add this schema's required fields
234        if let Some(schema_required) = &schema.required {
235            required.extend(schema_required.iter().cloned());
236        }
237
238        // Remove from visited set to allow diamond inheritance patterns
239        visited.remove(schema_name);
240
241        Ok(required)
242    }
243
244    /// Get all concrete (non-abstract) schemas
245    pub fn concrete_schemas(&self) -> Vec<String> {
246        self.schemas
247            .iter()
248            .filter(|(_, schema)| !schema.abstract_.unwrap_or(false))
249            .map(|(name, _)| name.clone())
250            .collect()
251    }
252}
253
/// A resolved schema with all properties flattened
///
/// Produced by `SchemaRegistry::resolve_inheritance`.
#[derive(Debug, Clone)]
pub struct ResolvedSchema {
    /// Name the schema was resolved under.
    pub name: String,
    /// Own and inherited properties keyed by property name; on a name clash
    /// the schema's own definition replaces the inherited one.
    pub all_properties: HashMap<String, FtmProperty>,
    /// Union of the required field names of this schema and its ancestors.
    pub all_required: HashSet<String>,
    /// Copy of the schema's own (unmerged) definition.
    pub metadata: FtmSchema,
}
262
263impl ResolvedSchema {
264    /// Check if this schema is abstract
265    pub fn is_abstract(&self) -> bool {
266        self.metadata.abstract_.unwrap_or(false)
267    }
268
269    /// Get the label for this schema
270    pub fn label(&self) -> Option<&str> {
271        self.metadata.label.as_deref()
272    }
273
274    /// Get the description if available
275    pub fn description(&self) -> Option<&str> {
276        self.metadata.description.as_deref()
277    }
278}
279
280#[cfg(test)]
281mod tests {
282    use crate::FtmEntity;
283
284    use super::*;
285    use std::{
286        fs::File,
287        io::{BufRead, BufReader, Write},
288    };
289    use tempfile::TempDir;
290
291    fn create_test_schema(dir: &Path, name: &str, yaml: &str) {
292        let path = dir.join(format!("{}.yml", name));
293        let mut file = fs::File::create(path).unwrap();
294        file.write_all(yaml.as_bytes()).unwrap();
295    }
296
297    #[test]
298    fn test_schema_loading() {
299        let temp_dir = TempDir::new().unwrap();
300
301        create_test_schema(
302            temp_dir.path(),
303            "Thing",
304            r#"
305label: Thing
306abstract: true
307properties:
308  name:
309    label: Name
310    type: name
311"#,
312        );
313
314        let registry = SchemaRegistry::load_from_cache(temp_dir.path()).unwrap();
315        assert_eq!(registry.count(), 1);
316        assert!(registry.get("Thing").is_some());
317    }
318
319    #[test]
320    fn test_inheritance_resolution() {
321        let temp_dir = TempDir::new().unwrap();
322
323        create_test_schema(
324            temp_dir.path(),
325            "Thing",
326            r#"
327label: Thing
328abstract: true
329properties:
330  name:
331    label: Name
332    type: name
333"#,
334        );
335
336        create_test_schema(
337            temp_dir.path(),
338            "Person",
339            r#"
340label: Person
341extends:
342  - Thing
343properties:
344  firstName:
345    label: First Name
346    type: name
347"#,
348        );
349
350        let registry = SchemaRegistry::load_from_cache(temp_dir.path()).unwrap();
351        let person = registry.resolve_inheritance("Person").unwrap();
352
353        assert_eq!(person.all_properties.len(), 2);
354        assert!(person.all_properties.contains_key("name"));
355        assert!(person.all_properties.contains_key("firstName"));
356    }
357
358    #[test]
359    fn test_circular_inheritance() {
360        let temp_dir = TempDir::new().unwrap();
361
362        create_test_schema(
363            temp_dir.path(),
364            "A",
365            r#"
366label: A
367extends:
368  - B
369properties: {}
370"#,
371        );
372
373        create_test_schema(
374            temp_dir.path(),
375            "B",
376            r#"
377label: B
378extends:
379  - A
380properties: {}
381"#,
382        );
383
384        let registry = SchemaRegistry::load_from_cache(temp_dir.path()).unwrap();
385        let result = registry.resolve_inheritance("A");
386        assert!(result.is_err());
387        assert!(result.unwrap_err().to_string().contains("Circular"));
388    }
389
390    /// Test with real FTM schemas from OpenSanctions
391    /// These schemas are based on the actual FollowTheMoney schema structure
392    #[test]
393    fn test_opensanctions_person_schema() {
394        let temp_dir = TempDir::new().unwrap();
395
396        // Thing - the base schema
397        create_test_schema(
398            temp_dir.path(),
399            "Thing",
400            r#"
401label: Thing
402abstract: true
403description: A basic object in a domain
404properties:
405  id:
406    label: ID
407    type: identifier
408    description: Unique identifier for the entity
409  schema:
410    label: Schema
411    type: string
412    description: The schema type for this entity
413  properties:
414    label: Properties
415    type: json
416    description: All properties as a JSON object
417"#,
418        );
419
420        // LegalEntity - intermediate abstract schema
421        create_test_schema(
422            temp_dir.path(),
423            "LegalEntity",
424            r#"
425label: Legal entity
426abstract: true
427extends:
428  - Thing
429description: A legal entity that can enter into legal relationships
430properties:
431  name:
432    label: Name
433    type: name
434    description: Primary name of the entity
435  alias:
436    label: Alias
437    type: name
438    description: Alternative names
439  country:
440    label: Country
441    type: country
442    description: Associated countries
443  jurisdiction:
444    label: Jurisdiction
445    type: country
446    description: Legal jurisdiction
447  notes:
448    label: Notes
449    type: text
450    description: Additional notes or comments
451  sourceUrl:
452    label: Source URL
453    type: url
454    description: URL of the source data
455  publisher:
456    label: Publisher
457    type: string
458    description: Data publisher name
459  publisherUrl:
460    label: Publisher URL
461    type: url
462    description: URL of the data publisher
463"#,
464        );
465
466        // Person - concrete schema
467        create_test_schema(
468            temp_dir.path(),
469            "Person",
470            r#"
471label: Person
472extends:
473  - LegalEntity
474description: An individual human being
475properties:
476  firstName:
477    label: First name
478    type: name
479    description: Given name
480  middleName:
481    label: Middle name
482    type: name
483    description: Middle name
484  lastName:
485    label: Last name
486    type: name
487    description: Family name
488  fatherName:
489    label: Father name
490    type: name
491    description: Patronymic name
492  motherName:
493    label: Mother name
494    type: name
495    description: Matronymic name
496  birthDate:
497    label: Date of birth
498    type: date
499    description: Date of birth
500  birthPlace:
501    label: Place of birth
502    type: string
503    description: Place of birth
504  deathDate:
505    label: Date of death
506    type: date
507    description: Date of death
508  nationality:
509    label: Nationality
510    type: country
511    description: Citizenship
512  idNumber:
513    label: ID number
514    type: identifier
515    description: National identity number
516  passportNumber:
517    label: Passport number
518    type: identifier
519    description: Passport number
520  email:
521    label: Email
522    type: email
523    description: Email address
524  phone:
525    label: Phone number
526    type: phone
527    description: Phone number
528  address:
529    label: Address
530    type: address
531    description: Physical address
532  position:
533    label: Position
534    type: string
535    description: Professional position
536  political:
537    label: Politically exposed
538    type: string
539    description: PEP designation
540"#,
541        );
542
543        let registry = SchemaRegistry::load_from_cache(temp_dir.path()).unwrap();
544
545        // Verify schemas loaded
546        assert_eq!(registry.count(), 3);
547        assert!(registry.get("Thing").is_some());
548        assert!(registry.get("LegalEntity").is_some());
549        assert!(registry.get("Person").is_some());
550
551        // Test Person inherits from LegalEntity and Thing
552        let person = registry.resolve_inheritance("Person").unwrap();
553
554        // Should have properties from all three schemas
555        assert!(person.all_properties.contains_key("id")); // from Thing
556        assert!(person.all_properties.contains_key("name")); // from LegalEntity
557        assert!(person.all_properties.contains_key("firstName")); // from Person
558        assert!(person.all_properties.contains_key("birthDate")); // from Person
559        assert!(person.all_properties.contains_key("nationality")); // from Person
560
561        // Verify abstract schemas are marked correctly
562        assert!(registry.get("Thing").unwrap().abstract_.unwrap_or(false));
563        assert!(
564            registry
565                .get("LegalEntity")
566                .unwrap()
567                .abstract_
568                .unwrap_or(false)
569        );
570        assert!(!registry.get("Person").unwrap().abstract_.unwrap_or(false));
571    }
572
573    #[test]
574    fn test_opensanctions_organization_schema() {
575        let temp_dir = TempDir::new().unwrap();
576
577        // Reuse Thing and LegalEntity from above
578        create_test_schema(
579            temp_dir.path(),
580            "Thing",
581            r#"
582label: Thing
583abstract: true
584properties:
585  id:
586    label: ID
587    type: identifier
588"#,
589        );
590
591        create_test_schema(
592            temp_dir.path(),
593            "LegalEntity",
594            r#"
595label: Legal entity
596abstract: true
597extends:
598  - Thing
599properties:
600  name:
601    label: Name
602    type: name
603  country:
604    label: Country
605    type: country
606"#,
607        );
608
609        // Organization schema
610        create_test_schema(
611            temp_dir.path(),
612            "Organization",
613            r#"
614label: Organization
615extends:
616  - LegalEntity
617description: A legal entity representing a group or institution
618properties:
619  incorporationDate:
620    label: Incorporation date
621    type: date
622    description: Date of incorporation
623  dissolutionDate:
624    label: Dissolution date
625    type: date
626    description: Date of dissolution
627  taxNumber:
628    label: Tax number
629    type: identifier
630    description: Tax identification number
631  registrationNumber:
632    label: Registration number
633    type: identifier
634    description: Company registration number
635  legalForm:
636    label: Legal form
637    type: string
638    description: Type of legal entity (LLC, Corp, etc)
639  sector:
640    label: Sector
641    type: string
642    description: Industry sector
643  classification:
644    label: Classification
645    type: string
646    description: Business classification
647  address:
648    label: Address
649    type: address
650    description: Registered address
651  email:
652    label: Email
653    type: email
654    description: Contact email
655  phone:
656    label: Phone
657    type: phone
658    description: Contact phone
659  website:
660    label: Website
661    type: url
662    description: Company website
663"#,
664        );
665
666        let registry = SchemaRegistry::load_from_cache(temp_dir.path()).unwrap();
667        let org = registry.resolve_inheritance("Organization").unwrap();
668
669        // Check inherited and direct properties
670        assert!(org.all_properties.contains_key("id"));
671        assert!(org.all_properties.contains_key("name"));
672        assert!(org.all_properties.contains_key("country"));
673        assert!(org.all_properties.contains_key("incorporationDate"));
674        assert!(org.all_properties.contains_key("taxNumber"));
675        assert!(org.all_properties.contains_key("website"));
676    }
677
678    #[test]
679    fn test_opensanctions_sanction_schema() {
680        let temp_dir = TempDir::new().unwrap();
681
682        // Base schemas
683        create_test_schema(
684            temp_dir.path(),
685            "Thing",
686            r#"
687label: Thing
688abstract: true
689properties:
690  id:
691    label: ID
692    type: identifier
693"#,
694        );
695
696        create_test_schema(
697            temp_dir.path(),
698            "Interval",
699            r#"
700label: Interval
701abstract: true
702extends:
703  - Thing
704description: An entity with a start and end date
705properties:
706  startDate:
707    label: Start date
708    type: date
709    description: Start date
710  endDate:
711    label: End date
712    type: date
713    description: End date
714"#,
715        );
716
717        // Sanction schema - common in OpenSanctions data
718        create_test_schema(
719            temp_dir.path(),
720            "Sanction",
721            r#"
722label: Sanction
723extends:
724  - Interval
725description: A legal sanction against a person or entity
726properties:
727  entity:
728    label: Entity
729    type: entity
730    description: Sanctioned entity
731  authority:
732    label: Authority
733    type: string
734    description: Sanctioning authority
735  sourceUrl:
736    label: Source URL
737    type: url
738    description: URL of sanction listing
739  program:
740    label: Program
741    type: string
742    description: Sanctions program name
743  reason:
744    label: Reason
745    type: text
746    description: Reason for sanction
747  provisions:
748    label: Provisions
749    type: string
750    description: Sanction provisions
751  country:
752    label: Country
753    type: country
754    description: Country imposing sanction
755  unscId:
756    label: UNSC ID
757    type: identifier
758    description: UN Security Council identifier
759  listingDate:
760    label: Listing date
761    type: date
762    description: Date added to sanctions list
763  duration:
764    label: Duration
765    type: string
766    description: Sanction duration
767"#,
768        );
769
770        let registry = SchemaRegistry::load_from_cache(temp_dir.path()).unwrap();
771        let sanction = registry.resolve_inheritance("Sanction").unwrap();
772
773        // Verify complete property inheritance chain
774        assert!(sanction.all_properties.contains_key("id")); // from Thing
775        assert!(sanction.all_properties.contains_key("startDate")); // from Interval
776        assert!(sanction.all_properties.contains_key("endDate")); // from Interval
777        assert!(sanction.all_properties.contains_key("entity")); // from Sanction
778        assert!(sanction.all_properties.contains_key("authority")); // from Sanction
779        assert!(sanction.all_properties.contains_key("program")); // from Sanction
780        assert!(sanction.all_properties.contains_key("unscId")); // from Sanction
781
782        // Total properties should include all from chain
783        assert!(sanction.all_properties.len() >= 12);
784    }
785
786    #[test]
787    fn test_multiple_inheritance_paths() {
788        let temp_dir = TempDir::new().unwrap();
789
790        // Diamond inheritance pattern (common in FTM)
791        create_test_schema(
792            temp_dir.path(),
793            "Thing",
794            r#"
795label: Thing
796abstract: true
797properties:
798  id:
799    label: ID
800    type: identifier
801  name:
802    label: Name
803    type: name
804"#,
805        );
806
807        create_test_schema(
808            temp_dir.path(),
809            "LegalEntity",
810            r#"
811label: Legal entity
812abstract: true
813extends:
814  - Thing
815properties:
816  country:
817    label: Country
818    type: country
819"#,
820        );
821
822        create_test_schema(
823            temp_dir.path(),
824            "DirectorshipEntity",
825            r#"
826label: Directorship entity
827abstract: true
828extends:
829  - Thing
830properties:
831  role:
832    label: Role
833    type: string
834"#,
835        );
836
837        // Company extends both LegalEntity and DirectorshipEntity (diamond pattern)
838        create_test_schema(
839            temp_dir.path(),
840            "Company",
841            r#"
842label: Company
843extends:
844  - LegalEntity
845properties:
846  registrationNumber:
847    label: Registration number
848    type: identifier
849"#,
850        );
851
852        let registry = SchemaRegistry::load_from_cache(temp_dir.path()).unwrap();
853        let company = registry.resolve_inheritance("Company").unwrap();
854
855        // Should have properties from all paths (no duplicates from Thing)
856        assert!(company.all_properties.contains_key("id"));
857        assert!(company.all_properties.contains_key("name"));
858        assert!(company.all_properties.contains_key("country"));
859        assert!(company.all_properties.contains_key("registrationNumber"));
860    }
861
862    #[test]
863    fn test_property_overriding() {
864        let temp_dir = TempDir::new().unwrap();
865
866        create_test_schema(
867            temp_dir.path(),
868            "Base",
869            r#"
870label: Base
871abstract: true
872properties:
873  value:
874    label: Original Value
875    type: string
876    description: Original description
877"#,
878        );
879
880        create_test_schema(
881            temp_dir.path(),
882            "Derived",
883            r#"
884label: Derived
885extends:
886  - Base
887properties:
888  value:
889    label: Overridden Value
890    type: number
891    description: New description
892  extra:
893    label: Extra
894    type: string
895"#,
896        );
897
898        let registry = SchemaRegistry::load_from_cache(temp_dir.path()).unwrap();
899        let derived = registry.resolve_inheritance("Derived").unwrap();
900
901        // Child schema properties should override parent
902        assert_eq!(derived.all_properties.len(), 2);
903        let value_prop = &derived.all_properties["value"];
904        assert_eq!(value_prop.label, Some("Overridden Value".to_string()));
905        assert_eq!(value_prop.type_, Some("number".to_string()));
906        assert_eq!(value_prop.description, Some("New description".to_string()));
907    }
908
909    #[test]
910    fn test_sample_sponsoring() {
911        // from https://dataresearchcenter.org/library/de_abgeordnetenwatch_sponsoring/
912        let test_file = "sample/de_abgeordnetenwatch_sponsoring.ftm.json.zst";
913        let test_file = BufReader::new(File::open(test_file).unwrap());
914        let test_data = zstd::decode_all(test_file).unwrap();
915        for line in test_data.lines() {
916            let line = line.unwrap();
917            let entity = FtmEntity::from_ftm_json(&line).unwrap();
918            // Verify we can access basic properties
919            assert!(!entity.id().is_empty());
920            assert!(!entity.schema().is_empty());
921        }
922    }
923
924    #[test]
925    fn test_from_ftm_json_person_schema() {
926        // Regression test: with #[serde(untagged)], Address (alphabetically first) was
927        // matching Person entities because both structs share the same required fields
928        // (id, schema) and all property fields are Option<Vec<String>>.
929        // height omitted: the generated struct uses Vec<f64> but FTM encodes it as a string
930        let json = r#"{"id": "e571b2b8ccfae7329036251acc47d0e833b280f5", "schema": "Person", "properties": {"motherName": ["nor"], "lastName": ["mention"], "nameSuffix": ["herself"], "birthDate": ["1961-05-20"], "birthPlace": ["data"], "nationality": ["et"], "appearance": ["to"], "religion": ["boy"], "profession": ["example"], "spokenLanguage": ["deu"], "abbreviation": ["Jamie Patterson"], "email": ["jamespalmer@example.com"], "incorporationDate": ["1959-11-10"], "taxStatus": ["eye"], "sector": ["democratic"], "registrationNumber": ["KKe-99272009"], "licenseNumber": ["rCG-26589103"], "opencorporatesUrl": ["http://www.avila-williams.biz/"], "bvdId": ["YFB-72857745"], "sayariId": ["kLt-25015135"], "brightQueryOrgId": ["DVr-24775349"], "icijId": ["real"], "name": ["Joseph Brown"], "description": ["Grow she future debate analysis much determine."], "alias": ["Jennifer Black"], "previousName": ["Anthony Davies"], "weakAlias": ["Nicole Smith"], "sourceUrl": ["http://www.wilson.com/"], "alephUrl": ["http://gilbert.com/"], "keywords": ["worker"], "createdAt": ["1998-05-22"], "retrievedAt": ["1998-11-28"]}}"#;
931
932        let entity = FtmEntity::from_ftm_json(json).unwrap();
933
934        assert_eq!(
935            entity.schema(),
936            "Person",
937            "expected Person schema, got {} — likely matched Address due to untagged enum ordering",
938            entity.schema()
939        );
940        assert_eq!(entity.id(), "e571b2b8ccfae7329036251acc47d0e833b280f5");
941
942        assert!(
943            matches!(entity, FtmEntity::Person(_)),
944            "entity should be FtmEntity::Person, got FtmEntity::{}",
945            entity.schema()
946        );
947    }
948
949    #[test]
950    fn test_sample_sidejobs() {
951        // from https://dataresearchcenter.org/library/de_abgeordnetenwatch_sidejobs/
952        let test_file = "sample/de_abgeordnetenwatch_sidejobs.ftm.json.zst";
953        let test_file = BufReader::new(File::open(test_file).unwrap());
954        let test_data = zstd::decode_all(test_file).unwrap();
955        for line in test_data.lines() {
956            let line = line.unwrap();
957            let entity = FtmEntity::from_ftm_json(&line).unwrap();
958            // Verify we can access basic properties
959            assert!(!entity.id().is_empty());
960            assert!(!entity.schema().is_empty());
961        }
962    }
963}