1use anyhow::{Context, Result};
6use serde::{Deserialize, Serialize};
7use std::collections::{HashMap, HashSet};
8use std::fs;
9use std::path::Path;
10
/// One schema definition as deserialized from a YAML schema file.
///
/// Every named field is optional in the input: `Option` fields fall back to
/// `None` and `properties` falls back to an empty map. Keys that match none
/// of the named fields are collected in `_extra`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct FtmSchema {
    /// Display label of the schema; surfaced by `ResolvedSchema::label`.
    #[serde(default)]
    pub label: Option<String>,
    /// Presumably the plural form of `label` -- not read anywhere in this file.
    #[serde(default)]
    pub plural: Option<String>,
    /// Free-text description; surfaced by `ResolvedSchema::description`.
    #[serde(default)]
    pub description: Option<String>,
    /// Names of the parent schemas. `SchemaRegistry::resolve_inheritance`
    /// walks these to merge inherited properties and required names.
    #[serde(default)]
    pub extends: Option<Vec<String>>,
    /// Stored under the key `abstract` (a Rust keyword, hence the trailing
    /// underscore). A missing value is treated as `false` by
    /// `SchemaRegistry::concrete_schemas` and `ResolvedSchema::is_abstract`.
    #[serde(rename = "abstract", default)]
    pub abstract_: Option<bool>,
    /// Not interpreted in this file.
    #[serde(default)]
    pub matchable: Option<bool>,
    /// Not interpreted in this file; presumably property names -- TODO confirm.
    #[serde(default)]
    pub featured: Option<Vec<String>>,
    /// Names this schema marks as required; unioned with the parents' lists
    /// during inheritance resolution.
    #[serde(default)]
    pub required: Option<Vec<String>>,
    /// Not interpreted in this file; presumably property names -- TODO confirm.
    #[serde(default)]
    pub caption: Option<Vec<String>>,
    /// Properties declared directly on this schema, keyed by property name.
    /// Inherited properties are NOT included here.
    #[serde(default)]
    pub properties: HashMap<String, FtmProperty>,
    /// Catch-all for keys not covered by the fields above, so unknown keys
    /// are retained in the struct instead of being dropped.
    #[serde(flatten)]
    pub _extra: HashMap<String, serde_json::Value>,
}
38
/// Value of the `reverse` key of a property definition (see
/// `FtmProperty::reverse`). Both fields are mandatory when the key is present.
// NOTE(review): presumably describes the inverse link that appears on the
// entity a property points at -- confirm against the schema format docs.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ReverseProperty {
    /// Name of the reverse property.
    pub name: String,
    /// Display label of the reverse property.
    pub label: String,
}
45
/// Definition of a single property inside `FtmSchema::properties`.
///
/// All named fields are optional in the input; keys that match none of them
/// are collected in `_extra`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct FtmProperty {
    /// Display label of the property.
    pub label: Option<String>,
    /// Stored under the key `type` (a Rust keyword, hence the trailing
    /// underscore). The value is kept as a plain string and not validated here.
    #[serde(rename = "type", default)]
    pub type_: Option<String>,
    /// Free-text description of the property.
    #[serde(default)]
    pub description: Option<String>,
    /// Optional reverse-property descriptor.
    #[serde(default)]
    pub reverse: Option<ReverseProperty>,
    /// Not interpreted in this file; presumably the schema name an
    /// entity-typed property may point to -- TODO confirm.
    #[serde(default)]
    pub range: Option<String>,
    /// Not interpreted in this file.
    #[serde(default)]
    pub stub: Option<bool>,
    /// Not interpreted in this file.
    #[serde(default)]
    pub deprecated: Option<bool>,
    /// Catch-all for keys not covered by the fields above.
    #[serde(flatten)]
    pub _extra: HashMap<String, serde_json::Value>,
}
66
67pub struct SchemaRegistry {
69 schemas: HashMap<String, FtmSchema>,
70}
71
72impl SchemaRegistry {
73 pub fn load_from_cache<P: AsRef<Path>>(cache_dir: P) -> Result<Self> {
75 let cache_path = cache_dir.as_ref();
76
77 if !cache_path.exists() {
78 anyhow::bail!("Cache directory does not exist: {:?}", cache_path);
79 }
80
81 let mut schemas = HashMap::new();
82
83 for entry in fs::read_dir(cache_path)? {
85 let entry = entry?;
86 let path = entry.path();
87
88 let ext = path.extension().and_then(|s| s.to_str());
89 if ext == Some("yml") || ext == Some("yaml") {
90 let schema_name = path
91 .file_stem()
92 .and_then(|s| s.to_str())
93 .context("Invalid schema filename")?
94 .to_string();
95
96 let content = fs::read_to_string(&path).context(format!(
97 "Failed to read schema file: {} at {:?}",
98 schema_name, path
99 ))?;
100
101 let schema = if let Ok(root) =
104 serde_yaml::from_str::<HashMap<String, FtmSchema>>(&content)
105 {
106 root.into_iter()
108 .next()
109 .map(|(_, schema)| schema)
110 .context(format!("Empty schema file: {} at {:?}", schema_name, path))?
111 } else {
112 serde_yaml::from_str::<FtmSchema>(&content).context(format!(
115 "Failed to parse schema: {} at {:?}\nFirst 500 chars of content:\n{}",
116 schema_name,
117 path,
118 &content.chars().take(500).collect::<String>()
119 ))?
120 };
121
122 schemas.insert(schema_name.clone(), schema);
123 }
124 }
125
126 if schemas.is_empty() {
127 anyhow::bail!("No schemas found in cache directory");
128 }
129
130 println!("Loaded {} schemas", schemas.len());
131 Ok(Self { schemas })
132 }
133
134 pub fn get(&self, name: &str) -> Option<&FtmSchema> {
136 self.schemas.get(name)
137 }
138
139 pub fn schema_names(&self) -> Vec<String> {
141 let mut names: Vec<_> = self.schemas.keys().cloned().collect();
142 names.sort();
143 names
144 }
145
146 pub fn count(&self) -> usize {
148 self.schemas.len()
149 }
150
151 pub fn resolve_inheritance(&self, schema_name: &str) -> Result<ResolvedSchema> {
153 let mut visited = HashSet::new();
154 let all_properties = self.resolve_properties_recursive(schema_name, &mut visited)?;
155
156 let mut visited_required = HashSet::new();
157 let all_required = self.resolve_required_recursive(schema_name, &mut visited_required)?;
158
159 let metadata = self
160 .get(schema_name)
161 .context(format!("Schema not found: {}", schema_name))?
162 .clone();
163
164 Ok(ResolvedSchema {
165 name: schema_name.to_string(),
166 all_properties,
167 all_required,
168 metadata,
169 })
170 }
171
172 fn resolve_properties_recursive(
174 &self,
175 schema_name: &str,
176 visited: &mut HashSet<String>,
177 ) -> Result<HashMap<String, FtmProperty>> {
178 if visited.contains(schema_name) {
180 anyhow::bail!("Circular inheritance detected: {}", schema_name);
181 }
182 visited.insert(schema_name.to_string());
183
184 let schema = self
185 .get(schema_name)
186 .context(format!("Schema not found: {}", schema_name))?;
187
188 let mut properties = HashMap::new();
189
190 if let Some(extends) = &schema.extends {
192 for parent_name in extends {
193 let parent_props = self.resolve_properties_recursive(parent_name, visited)?;
194 properties.extend(parent_props);
195 }
196 }
197
198 properties.extend(schema.properties.clone());
200
201 visited.remove(schema_name);
203
204 Ok(properties)
205 }
206
207 fn resolve_required_recursive(
209 &self,
210 schema_name: &str,
211 visited: &mut HashSet<String>,
212 ) -> Result<HashSet<String>> {
213 if visited.contains(schema_name) {
215 anyhow::bail!("Circular inheritance detected: {}", schema_name);
216 }
217 visited.insert(schema_name.to_string());
218
219 let schema = self
220 .get(schema_name)
221 .context(format!("Schema not found: {}", schema_name))?;
222
223 let mut required = HashSet::new();
224
225 if let Some(extends) = &schema.extends {
227 for parent_name in extends {
228 let parent_required = self.resolve_required_recursive(parent_name, visited)?;
229 required.extend(parent_required);
230 }
231 }
232
233 if let Some(schema_required) = &schema.required {
235 required.extend(schema_required.iter().cloned());
236 }
237
238 visited.remove(schema_name);
240
241 Ok(required)
242 }
243
244 pub fn concrete_schemas(&self) -> Vec<String> {
246 self.schemas
247 .iter()
248 .filter(|(_, schema)| !schema.abstract_.unwrap_or(false))
249 .map(|(name, _)| name.clone())
250 .collect()
251 }
252}
253
/// A schema with its inheritance chain flattened, as produced by
/// `SchemaRegistry::resolve_inheritance`.
#[derive(Debug, Clone)]
pub struct ResolvedSchema {
    /// Name the schema was resolved under.
    pub name: String,
    /// Own properties plus every inherited property, keyed by property name.
    pub all_properties: HashMap<String, FtmProperty>,
    /// Union of the `required` lists of the schema and all of its ancestors.
    pub all_required: HashSet<String>,
    /// Clone of the schema's own, unresolved definition.
    pub metadata: FtmSchema,
}
262
263impl ResolvedSchema {
264 pub fn is_abstract(&self) -> bool {
266 self.metadata.abstract_.unwrap_or(false)
267 }
268
269 pub fn label(&self) -> Option<&str> {
271 self.metadata.label.as_deref()
272 }
273
274 pub fn description(&self) -> Option<&str> {
276 self.metadata.description.as_deref()
277 }
278}
279
#[cfg(test)]
mod tests {
    use crate::FtmEntity;

    use super::*;
    use std::{
        fs::File,
        io::{BufRead, BufReader, Write},
    };
    use tempfile::TempDir;

    /// Writes `yaml` to `<dir>/<name>.yml` so the registry loader can pick it up.
    fn create_test_schema(dir: &Path, name: &str, yaml: &str) {
        let path = dir.join(format!("{}.yml", name));
        let mut file = File::create(path).unwrap();
        file.write_all(yaml.as_bytes()).unwrap();
    }

    /// Decompresses a zstd-compressed, line-delimited JSON sample and checks
    /// that every line parses into an entity with a non-empty id and schema.
    /// `sample_path` is resolved relative to the working directory of the
    /// test run.
    fn assert_sample_entities_parse(sample_path: &str) {
        let reader = BufReader::new(File::open(sample_path).unwrap());
        let decoded = zstd::decode_all(reader).unwrap();
        for line in decoded.lines() {
            let line = line.unwrap();
            let entity = FtmEntity::from_ftm_json(&line).unwrap();
            assert!(!entity.id().is_empty());
            assert!(!entity.schema().is_empty());
        }
    }

    #[test]
    fn test_schema_loading() {
        let temp_dir = TempDir::new().unwrap();

        create_test_schema(
            temp_dir.path(),
            "Thing",
            r#"
label: Thing
abstract: true
properties:
  name:
    label: Name
    type: name
"#,
        );

        let registry = SchemaRegistry::load_from_cache(temp_dir.path()).unwrap();
        assert_eq!(registry.count(), 1);
        assert!(registry.get("Thing").is_some());
    }

    #[test]
    fn test_inheritance_resolution() {
        let temp_dir = TempDir::new().unwrap();

        create_test_schema(
            temp_dir.path(),
            "Thing",
            r#"
label: Thing
abstract: true
properties:
  name:
    label: Name
    type: name
"#,
        );

        create_test_schema(
            temp_dir.path(),
            "Person",
            r#"
label: Person
extends:
  - Thing
properties:
  firstName:
    label: First Name
    type: name
"#,
        );

        let registry = SchemaRegistry::load_from_cache(temp_dir.path()).unwrap();
        let person = registry.resolve_inheritance("Person").unwrap();

        // One inherited property plus one declared on Person itself.
        assert_eq!(person.all_properties.len(), 2);
        assert!(person.all_properties.contains_key("name"));
        assert!(person.all_properties.contains_key("firstName"));
    }

    #[test]
    fn test_circular_inheritance() {
        let temp_dir = TempDir::new().unwrap();

        create_test_schema(
            temp_dir.path(),
            "A",
            r#"
label: A
extends:
  - B
properties: {}
"#,
        );

        create_test_schema(
            temp_dir.path(),
            "B",
            r#"
label: B
extends:
  - A
properties: {}
"#,
        );

        let registry = SchemaRegistry::load_from_cache(temp_dir.path()).unwrap();
        let result = registry.resolve_inheritance("A");
        assert!(result.is_err());
        assert!(result.unwrap_err().to_string().contains("Circular"));
    }

    #[test]
    fn test_opensanctions_person_schema() {
        let temp_dir = TempDir::new().unwrap();

        // Root of the chain: Thing <- LegalEntity <- Person.
        create_test_schema(
            temp_dir.path(),
            "Thing",
            r#"
label: Thing
abstract: true
description: A basic object in a domain
properties:
  id:
    label: ID
    type: identifier
    description: Unique identifier for the entity
  schema:
    label: Schema
    type: string
    description: The schema type for this entity
  properties:
    label: Properties
    type: json
    description: All properties as a JSON object
"#,
        );

        create_test_schema(
            temp_dir.path(),
            "LegalEntity",
            r#"
label: Legal entity
abstract: true
extends:
  - Thing
description: A legal entity that can enter into legal relationships
properties:
  name:
    label: Name
    type: name
    description: Primary name of the entity
  alias:
    label: Alias
    type: name
    description: Alternative names
  country:
    label: Country
    type: country
    description: Associated countries
  jurisdiction:
    label: Jurisdiction
    type: country
    description: Legal jurisdiction
  notes:
    label: Notes
    type: text
    description: Additional notes or comments
  sourceUrl:
    label: Source URL
    type: url
    description: URL of the source data
  publisher:
    label: Publisher
    type: string
    description: Data publisher name
  publisherUrl:
    label: Publisher URL
    type: url
    description: URL of the data publisher
"#,
        );

        create_test_schema(
            temp_dir.path(),
            "Person",
            r#"
label: Person
extends:
  - LegalEntity
description: An individual human being
properties:
  firstName:
    label: First name
    type: name
    description: Given name
  middleName:
    label: Middle name
    type: name
    description: Middle name
  lastName:
    label: Last name
    type: name
    description: Family name
  fatherName:
    label: Father name
    type: name
    description: Patronymic name
  motherName:
    label: Mother name
    type: name
    description: Matronymic name
  birthDate:
    label: Date of birth
    type: date
    description: Date of birth
  birthPlace:
    label: Place of birth
    type: string
    description: Place of birth
  deathDate:
    label: Date of death
    type: date
    description: Date of death
  nationality:
    label: Nationality
    type: country
    description: Citizenship
  idNumber:
    label: ID number
    type: identifier
    description: National identity number
  passportNumber:
    label: Passport number
    type: identifier
    description: Passport number
  email:
    label: Email
    type: email
    description: Email address
  phone:
    label: Phone number
    type: phone
    description: Phone number
  address:
    label: Address
    type: address
    description: Physical address
  position:
    label: Position
    type: string
    description: Professional position
  political:
    label: Politically exposed
    type: string
    description: PEP designation
"#,
        );

        let registry = SchemaRegistry::load_from_cache(temp_dir.path()).unwrap();

        assert_eq!(registry.count(), 3);
        assert!(registry.get("Thing").is_some());
        assert!(registry.get("LegalEntity").is_some());
        assert!(registry.get("Person").is_some());

        let person = registry.resolve_inheritance("Person").unwrap();

        // Inherited from Thing.
        assert!(person.all_properties.contains_key("id"));
        // Inherited from LegalEntity.
        assert!(person.all_properties.contains_key("name"));
        // Declared on Person itself.
        assert!(person.all_properties.contains_key("firstName"));
        assert!(person.all_properties.contains_key("birthDate"));
        assert!(person.all_properties.contains_key("nationality"));

        // Only the two base schemas are abstract.
        assert!(registry.get("Thing").unwrap().abstract_.unwrap_or(false));
        assert!(
            registry
                .get("LegalEntity")
                .unwrap()
                .abstract_
                .unwrap_or(false)
        );
        assert!(!registry.get("Person").unwrap().abstract_.unwrap_or(false));
    }

    #[test]
    fn test_opensanctions_organization_schema() {
        let temp_dir = TempDir::new().unwrap();

        // Chain under test: Thing <- LegalEntity <- Organization.
        create_test_schema(
            temp_dir.path(),
            "Thing",
            r#"
label: Thing
abstract: true
properties:
  id:
    label: ID
    type: identifier
"#,
        );

        create_test_schema(
            temp_dir.path(),
            "LegalEntity",
            r#"
label: Legal entity
abstract: true
extends:
  - Thing
properties:
  name:
    label: Name
    type: name
  country:
    label: Country
    type: country
"#,
        );

        create_test_schema(
            temp_dir.path(),
            "Organization",
            r#"
label: Organization
extends:
  - LegalEntity
description: A legal entity representing a group or institution
properties:
  incorporationDate:
    label: Incorporation date
    type: date
    description: Date of incorporation
  dissolutionDate:
    label: Dissolution date
    type: date
    description: Date of dissolution
  taxNumber:
    label: Tax number
    type: identifier
    description: Tax identification number
  registrationNumber:
    label: Registration number
    type: identifier
    description: Company registration number
  legalForm:
    label: Legal form
    type: string
    description: Type of legal entity (LLC, Corp, etc)
  sector:
    label: Sector
    type: string
    description: Industry sector
  classification:
    label: Classification
    type: string
    description: Business classification
  address:
    label: Address
    type: address
    description: Registered address
  email:
    label: Email
    type: email
    description: Contact email
  phone:
    label: Phone
    type: phone
    description: Contact phone
  website:
    label: Website
    type: url
    description: Company website
"#,
        );

        let registry = SchemaRegistry::load_from_cache(temp_dir.path()).unwrap();
        let org = registry.resolve_inheritance("Organization").unwrap();

        // Inherited from Thing.
        assert!(org.all_properties.contains_key("id"));
        // Inherited from LegalEntity.
        assert!(org.all_properties.contains_key("name"));
        assert!(org.all_properties.contains_key("country"));
        // Declared on Organization itself.
        assert!(org.all_properties.contains_key("incorporationDate"));
        assert!(org.all_properties.contains_key("taxNumber"));
        assert!(org.all_properties.contains_key("website"));
    }

    #[test]
    fn test_opensanctions_sanction_schema() {
        let temp_dir = TempDir::new().unwrap();

        // Chain under test: Thing <- Interval <- Sanction.
        create_test_schema(
            temp_dir.path(),
            "Thing",
            r#"
label: Thing
abstract: true
properties:
  id:
    label: ID
    type: identifier
"#,
        );

        create_test_schema(
            temp_dir.path(),
            "Interval",
            r#"
label: Interval
abstract: true
extends:
  - Thing
description: An entity with a start and end date
properties:
  startDate:
    label: Start date
    type: date
    description: Start date
  endDate:
    label: End date
    type: date
    description: End date
"#,
        );

        create_test_schema(
            temp_dir.path(),
            "Sanction",
            r#"
label: Sanction
extends:
  - Interval
description: A legal sanction against a person or entity
properties:
  entity:
    label: Entity
    type: entity
    description: Sanctioned entity
  authority:
    label: Authority
    type: string
    description: Sanctioning authority
  sourceUrl:
    label: Source URL
    type: url
    description: URL of sanction listing
  program:
    label: Program
    type: string
    description: Sanctions program name
  reason:
    label: Reason
    type: text
    description: Reason for sanction
  provisions:
    label: Provisions
    type: string
    description: Sanction provisions
  country:
    label: Country
    type: country
    description: Country imposing sanction
  unscId:
    label: UNSC ID
    type: identifier
    description: UN Security Council identifier
  listingDate:
    label: Listing date
    type: date
    description: Date added to sanctions list
  duration:
    label: Duration
    type: string
    description: Sanction duration
"#,
        );

        let registry = SchemaRegistry::load_from_cache(temp_dir.path()).unwrap();
        let sanction = registry.resolve_inheritance("Sanction").unwrap();

        // Inherited from Thing.
        assert!(sanction.all_properties.contains_key("id"));
        // Inherited from Interval.
        assert!(sanction.all_properties.contains_key("startDate"));
        assert!(sanction.all_properties.contains_key("endDate"));
        // Declared on Sanction itself.
        assert!(sanction.all_properties.contains_key("entity"));
        assert!(sanction.all_properties.contains_key("authority"));
        assert!(sanction.all_properties.contains_key("program"));
        assert!(sanction.all_properties.contains_key("unscId"));

        assert!(sanction.all_properties.len() >= 12);
    }

    #[test]
    fn test_multiple_inheritance_paths() {
        let temp_dir = TempDir::new().unwrap();

        create_test_schema(
            temp_dir.path(),
            "Thing",
            r#"
label: Thing
abstract: true
properties:
  id:
    label: ID
    type: identifier
  name:
    label: Name
    type: name
"#,
        );

        create_test_schema(
            temp_dir.path(),
            "LegalEntity",
            r#"
label: Legal entity
abstract: true
extends:
  - Thing
properties:
  country:
    label: Country
    type: country
"#,
        );

        // Sibling branch below Thing that Company does NOT extend.
        create_test_schema(
            temp_dir.path(),
            "DirectorshipEntity",
            r#"
label: Directorship entity
abstract: true
extends:
  - Thing
properties:
  role:
    label: Role
    type: string
"#,
        );

        create_test_schema(
            temp_dir.path(),
            "Company",
            r#"
label: Company
extends:
  - LegalEntity
properties:
  registrationNumber:
    label: Registration number
    type: identifier
"#,
        );

        let registry = SchemaRegistry::load_from_cache(temp_dir.path()).unwrap();
        let company = registry.resolve_inheritance("Company").unwrap();

        // Inherited from Thing via LegalEntity.
        assert!(company.all_properties.contains_key("id"));
        assert!(company.all_properties.contains_key("name"));
        // Inherited from LegalEntity.
        assert!(company.all_properties.contains_key("country"));
        // Declared on Company itself.
        assert!(company.all_properties.contains_key("registrationNumber"));
        // Properties of the unrelated sibling branch must not leak in.
        assert!(!company.all_properties.contains_key("role"));
    }

    #[test]
    fn test_property_overriding() {
        let temp_dir = TempDir::new().unwrap();

        create_test_schema(
            temp_dir.path(),
            "Base",
            r#"
label: Base
abstract: true
properties:
  value:
    label: Original Value
    type: string
    description: Original description
"#,
        );

        create_test_schema(
            temp_dir.path(),
            "Derived",
            r#"
label: Derived
extends:
  - Base
properties:
  value:
    label: Overridden Value
    type: number
    description: New description
  extra:
    label: Extra
    type: string
"#,
        );

        let registry = SchemaRegistry::load_from_cache(temp_dir.path()).unwrap();
        let derived = registry.resolve_inheritance("Derived").unwrap();

        // `value` is counted once; the child's definition replaces the parent's.
        assert_eq!(derived.all_properties.len(), 2);
        let value_prop = &derived.all_properties["value"];
        assert_eq!(value_prop.label, Some("Overridden Value".to_string()));
        assert_eq!(value_prop.type_, Some("number".to_string()));
        assert_eq!(value_prop.description, Some("New description".to_string()));
    }

    #[test]
    fn test_sample_sponsoring() {
        assert_sample_entities_parse("sample/de_abgeordnetenwatch_sponsoring.ftm.json.zst");
    }

    /// Guards against a `Person` payload being classified as another schema;
    /// see the assertion message for the failure mode this was written for.
    #[test]
    fn test_from_ftm_json_person_schema() {
        let json = r#"{"id": "e571b2b8ccfae7329036251acc47d0e833b280f5", "schema": "Person", "properties": {"motherName": ["nor"], "lastName": ["mention"], "nameSuffix": ["herself"], "birthDate": ["1961-05-20"], "birthPlace": ["data"], "nationality": ["et"], "appearance": ["to"], "religion": ["boy"], "profession": ["example"], "spokenLanguage": ["deu"], "abbreviation": ["Jamie Patterson"], "email": ["jamespalmer@example.com"], "incorporationDate": ["1959-11-10"], "taxStatus": ["eye"], "sector": ["democratic"], "registrationNumber": ["KKe-99272009"], "licenseNumber": ["rCG-26589103"], "opencorporatesUrl": ["http://www.avila-williams.biz/"], "bvdId": ["YFB-72857745"], "sayariId": ["kLt-25015135"], "brightQueryOrgId": ["DVr-24775349"], "icijId": ["real"], "name": ["Joseph Brown"], "description": ["Grow she future debate analysis much determine."], "alias": ["Jennifer Black"], "previousName": ["Anthony Davies"], "weakAlias": ["Nicole Smith"], "sourceUrl": ["http://www.wilson.com/"], "alephUrl": ["http://gilbert.com/"], "keywords": ["worker"], "createdAt": ["1998-05-22"], "retrievedAt": ["1998-11-28"]}}"#;

        let entity = FtmEntity::from_ftm_json(json).unwrap();

        assert_eq!(
            entity.schema(),
            "Person",
            "expected Person schema, got {} — likely matched Address due to untagged enum ordering",
            entity.schema()
        );
        assert_eq!(entity.id(), "e571b2b8ccfae7329036251acc47d0e833b280f5");

        assert!(
            matches!(entity, FtmEntity::Person(_)),
            "entity should be FtmEntity::Person, got FtmEntity::{}",
            entity.schema()
        );
    }

    #[test]
    fn test_sample_sidejobs() {
        assert_sample_entities_parse("sample/de_abgeordnetenwatch_sidejobs.ftm.json.zst");
    }
}