Skip to main content

thing_matcher/
models.rs

1//! Data models for things, aligned with `schema.org/Thing`.
2//!
3//! This module is intentionally **logic-free**: it defines the types that
4//! flow through the matching engine but contains no matching code itself.
5//! See [`crate::matcher`] for the engine and [`crate::normalizer`] for the
6//! text transformations that the matcher applies to these fields.
7//!
8//! All public types here are `Serialize + Deserialize` so they round-trip
9//! through JSON, MessagePack, or any other `serde` format.
10//!
11//! ## Schema.org alignment
12//!
13//! The fields of [`Thing`] correspond to the properties of
14//! `schema.org/Thing`:
15//!
16//! | Rust field | schema.org property |
17//! |---|---|
18//! | `name` | `name` |
19//! | `alternate_names` | `alternateName` |
20//! | `description` | `description` |
21//! | `disambiguating_description` | `disambiguatingDescription` |
22//! | `identifiers` | `identifier` (as `PropertyValue`) |
23//! | `url` | `url` |
24//! | `image` | `image` |
25//! | `same_as` | `sameAs` |
26//! | `main_entity_of_page` | `mainEntityOfPage` |
27//! | `additional_types` | `additionalType` |
28//! | `subject_of` | `subjectOf` |
29//! | `owner` | `owner` |
30//!
31//! ## Building a thing
32//!
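//! Build values with [`Thing::builder`] rather than a struct literal:
//! [`Thing`] is `#[non_exhaustive]`, so literal construction is not
//! available outside this crate. The builder's string setters accept any
//! `Into<String>`, so call-sites can pass `&str` or `String`
//! interchangeably.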
36//!
37//! ```
38//! use thing_matcher::Thing;
39//!
40//! let t = Thing::builder()
41//!     .name("Eiffel Tower")
42//!     .add_alternate_name("La Tour Eiffel")
43//!     .url("https://www.toureiffel.paris/")
44//!     .build();
45//!
46//! assert_eq!(t.name.as_deref(), Some("Eiffel Tower"));
47//! ```
48
49use serde::{Deserialize, Serialize};
50
51/// External identifier for a thing, modelled on `schema.org/PropertyValue`.
52///
53/// Two `Identifier` values are equal iff both their `property_id` and their
54/// `value` are equal. Equality is structural — no per-scheme
55/// canonicalisation is performed.
56///
57/// `property_id` is the issuer or vocabulary that names the identifier,
58/// such as `"wikidata"`, `"isbn"`, `"doi"`, `"gtin"`, or a fully-qualified
59/// URL. `value` is the identifier string itself.
60///
61/// # Example
62///
63/// ```
64/// use thing_matcher::Identifier;
65///
66/// let a = Identifier::new("wikidata", "Q243").unwrap();
67/// let b = Identifier::new("wikidata", " Q243 ").unwrap();
68/// assert_eq!(a, b, "values are trimmed at construction");
69/// assert!(Identifier::new("wikidata", "").is_none());
70/// ```
71#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize)]
72pub struct Identifier {
73    /// Issuer or vocabulary that scopes the identifier, e.g.
74    /// `"wikidata"`, `"isbn"`, `"doi"`, `"gtin"`, or a URL.
75    pub property_id: String,
76    /// The identifier value, trimmed of surrounding whitespace.
77    pub value: String,
78}
79
80impl Identifier {
81    /// Construct an [`Identifier`], trimming the value and property id of
82    /// surrounding whitespace. Returns `None` if either trimmed component
83    /// is empty.
84    ///
85    /// No further normalisation is applied — different vocabularies have
86    /// different rules and the crate makes no assumptions.
87    ///
88    /// # Example
89    ///
90    /// ```
91    /// use thing_matcher::Identifier;
92    ///
93    /// let id = Identifier::new("wikidata", "  Q243  ").unwrap();
94    /// assert_eq!(id.value, "Q243");
95    /// assert_eq!(id.property_id, "wikidata");
96    ///
97    /// assert!(Identifier::new("wikidata", "   ").is_none());
98    /// assert!(Identifier::new("   ", "Q243").is_none());
99    /// ```
100    pub fn new(property_id: impl Into<String>, value: impl Into<String>) -> Option<Self> {
101        let property_id = property_id.into().trim().to_string();
102        let value = value.into().trim().to_string();
103        if property_id.is_empty() || value.is_empty() {
104            None
105        } else {
106            Some(Self { property_id, value })
107        }
108    }
109}
110
111/// Core data structure for a thing, aligned with `schema.org/Thing`.
112///
113/// Every field is optional (or defaults to empty). The matcher tolerates
114/// missing data field-by-field — a `None` value never penalises a thing.
115/// See [`crate::matcher::MatchingEngine::match_things`] for how missing
116/// fields affect the weighted score.
117///
118/// Construct via [`Thing::builder`] rather than struct literal syntax so
119/// the call-site stays compact and forward-compatible if fields are added.
120///
121/// # Example
122///
123/// ```
124/// use thing_matcher::Thing;
125///
126/// let t = Thing::builder()
127///     .name("Eiffel Tower")
128///     .add_alternate_name("La Tour Eiffel")
129///     .description("Wrought-iron lattice tower on the Champ de Mars in Paris.")
130///     .url("https://www.toureiffel.paris/")
131///     .build();
132///
133/// assert_eq!(t.name.as_deref(), Some("Eiffel Tower"));
134/// assert_eq!(t.alternate_names, vec!["La Tour Eiffel".to_string()]);
135/// ```
136///
137/// `Thing` round-trips through `serde`.
138///
139/// ```
140/// # use thing_matcher::Thing;
141/// let t = Thing::builder().name("Eiffel Tower").build();
142/// let json = serde_json::to_string(&t).unwrap();
143/// let back: Thing = serde_json::from_str(&json).unwrap();
144/// assert_eq!(t, back);
145/// ```
146#[derive(Debug, Clone, Default, PartialEq, Eq, Serialize, Deserialize)]
147#[non_exhaustive]
148pub struct Thing {
149    /// Primary canonical name. Corresponds to `schema.org/name`.
150    pub name: Option<String>,
151
152    /// Aliases, endonyms, or translations. Corresponds to
153    /// `schema.org/alternateName`. The matcher takes the best score
154    /// across the cartesian product of primary + alternates.
155    pub alternate_names: Vec<String>,
156
157    /// Free-form description. Corresponds to `schema.org/description`.
158    pub description: Option<String>,
159
160    /// Short disambiguating description. Corresponds to
161    /// `schema.org/disambiguatingDescription`.
162    pub disambiguating_description: Option<String>,
163
164    /// Scheme-scoped external identifiers. Corresponds to
165    /// `schema.org/identifier` modelled as `PropertyValue`. Sharing any
166    /// one `(property_id, value)` pair across two things is a
167    /// deterministic match.
168    pub identifiers: Vec<Identifier>,
169
170    /// Canonical URL of the item. Corresponds to `schema.org/url`.
171    pub url: Option<String>,
172
173    /// URL of a representative image. Corresponds to `schema.org/image`.
174    pub image: Option<String>,
175
176    /// Reference URLs that unambiguously indicate the same item, e.g. a
177    /// Wikipedia article or an authority record. Corresponds to
178    /// `schema.org/sameAs`.
179    pub same_as: Vec<String>,
180
181    /// Page (URL) for which this thing is the main entity. Corresponds
182    /// to `schema.org/mainEntityOfPage`.
183    pub main_entity_of_page: Option<String>,
184
185    /// Additional types from external vocabularies, typically schema.org
186    /// subtypes or other ontology URIs. Corresponds to
187    /// `schema.org/additionalType`.
188    pub additional_types: Vec<String>,
189
190    /// Works or events about this thing (URLs). Corresponds to
191    /// `schema.org/subjectOf`.
192    pub subject_of: Vec<String>,
193
194    /// Person or organisation that owns this thing. Corresponds to
195    /// `schema.org/owner`. Stored as a string (a name or URL) — the
196    /// crate does not model `Person` / `Organization` separately.
197    pub owner: Option<String>,
198
199    /// Local identifier issued by the originating system. Not
200    /// normalised, not scored — different organisations may issue
201    /// colliding values. Kept for round-trip honesty.
202    pub local_id: Option<String>,
203}
204
205impl Thing {
206    /// Begin constructing a [`Thing`] with the [`ThingBuilder`].
207    ///
208    /// All fields default to `None` / empty until a setter is called.
209    ///
210    /// # Example
211    ///
212    /// ```
213    /// use thing_matcher::Thing;
214    ///
215    /// let t = Thing::builder()
216    ///     .name("Big Ben")
217    ///     .build();
218    ///
219    /// assert_eq!(t.name.as_deref(), Some("Big Ben"));
220    /// ```
221    pub fn builder() -> ThingBuilder {
222        ThingBuilder::default()
223    }
224
225    /// Validate that the thing carries a primary name.
226    ///
227    /// Returns `Ok(())` if `name` is set. Otherwise returns
228    /// [`crate::MatchingError::MissingField`].
229    ///
230    /// This is **not** invoked automatically by the matcher — call it at
231    /// the system boundary when you ingest data, not on every
232    /// comparison.
233    ///
234    /// # Example
235    ///
236    /// ```
237    /// use thing_matcher::Thing;
238    ///
239    /// assert!(Thing::builder().name("Eiffel Tower").build().validate().is_ok());
240    /// assert!(Thing::builder().build().validate().is_err());
241    /// ```
242    pub fn validate(&self) -> crate::Result<()> {
243        if self.name.is_none() {
244            return Err(crate::MatchingError::MissingField(
245                "name is required".to_string(),
246            ));
247        }
248        Ok(())
249    }
250}
251
252/// Fluent builder for [`Thing`].
253///
254/// All string setters accept `impl Into<String>` so call-sites may pass
255/// `&str`, `String`, or `&String` interchangeably without explicit
256/// conversion.
257///
258/// # Example
259///
260/// ```
261/// use thing_matcher::{Thing, ThingBuilder};
262///
263/// let t: Thing = ThingBuilder::default()
264///     .name(String::from("Eiffel Tower"))
265///     .add_alternate_name("La Tour Eiffel")
266///     .url("https://www.toureiffel.paris/")
267///     .add_same_as("https://www.wikidata.org/wiki/Q243")
268///     .build();
269///
270/// assert_eq!(t.name.as_deref(), Some("Eiffel Tower"));
271/// assert_eq!(t.same_as.len(), 1);
272/// ```
273#[derive(Default)]
274pub struct ThingBuilder {
275    name: Option<String>,
276    alternate_names: Vec<String>,
277    description: Option<String>,
278    disambiguating_description: Option<String>,
279    identifiers: Vec<Identifier>,
280    url: Option<String>,
281    image: Option<String>,
282    same_as: Vec<String>,
283    main_entity_of_page: Option<String>,
284    additional_types: Vec<String>,
285    subject_of: Vec<String>,
286    owner: Option<String>,
287    local_id: Option<String>,
288}
289
290impl ThingBuilder {
291    /// Set the primary canonical name.
292    ///
293    /// ```
294    /// # use thing_matcher::Thing;
295    /// let t = Thing::builder().name("Eiffel Tower").build();
296    /// assert_eq!(t.name.as_deref(), Some("Eiffel Tower"));
297    /// ```
298    pub fn name<S: Into<String>>(mut self, value: S) -> Self {
299        self.name = Some(value.into());
300        self
301    }
302
303    /// Replace the entire list of alternate names.
304    ///
305    /// ```
306    /// # use thing_matcher::Thing;
307    /// let t = Thing::builder()
308    ///     .alternate_names(vec!["La Tour Eiffel".into(), "Tour Eiffel".into()])
309    ///     .build();
310    /// assert_eq!(t.alternate_names.len(), 2);
311    /// ```
312    pub fn alternate_names(mut self, value: Vec<String>) -> Self {
313        self.alternate_names = value;
314        self
315    }
316
317    /// Append a single alternate name.
318    ///
319    /// ```
320    /// # use thing_matcher::Thing;
321    /// let t = Thing::builder()
322    ///     .add_alternate_name("La Tour Eiffel")
323    ///     .add_alternate_name("Tour Eiffel")
324    ///     .build();
325    /// assert_eq!(t.alternate_names.len(), 2);
326    /// ```
327    pub fn add_alternate_name<S: Into<String>>(mut self, value: S) -> Self {
328        self.alternate_names.push(value.into());
329        self
330    }
331
332    /// Set the free-form description.
333    ///
334    /// ```
335    /// # use thing_matcher::Thing;
336    /// let t = Thing::builder().description("A landmark in Paris.").build();
337    /// assert_eq!(t.description.as_deref(), Some("A landmark in Paris."));
338    /// ```
339    pub fn description<S: Into<String>>(mut self, value: S) -> Self {
340        self.description = Some(value.into());
341        self
342    }
343
344    /// Set the disambiguating description.
345    ///
346    /// ```
347    /// # use thing_matcher::Thing;
348    /// let t = Thing::builder().disambiguating_description("The Paris one.").build();
349    /// assert_eq!(t.disambiguating_description.as_deref(), Some("The Paris one."));
350    /// ```
351    pub fn disambiguating_description<S: Into<String>>(mut self, value: S) -> Self {
352        self.disambiguating_description = Some(value.into());
353        self
354    }
355
356    /// Replace the entire list of identifiers.
357    ///
358    /// ```
359    /// # use thing_matcher::{Thing, Identifier};
360    /// let id = Identifier::new("wikidata", "Q243").unwrap();
361    /// let t = Thing::builder().identifiers(vec![id.clone()]).build();
362    /// assert_eq!(t.identifiers, vec![id]);
363    /// ```
364    pub fn identifiers(mut self, value: Vec<Identifier>) -> Self {
365        self.identifiers = value;
366        self
367    }
368
369    /// Append a single identifier.
370    ///
371    /// ```
372    /// # use thing_matcher::{Thing, Identifier};
373    /// let t = Thing::builder()
374    ///     .add_identifier(Identifier::new("wikidata", "Q243").unwrap())
375    ///     .build();
376    /// assert_eq!(t.identifiers.len(), 1);
377    /// ```
378    pub fn add_identifier(mut self, value: Identifier) -> Self {
379        self.identifiers.push(value);
380        self
381    }
382
383    /// Set the canonical URL.
384    ///
385    /// ```
386    /// # use thing_matcher::Thing;
387    /// let t = Thing::builder().url("https://www.toureiffel.paris/").build();
388    /// assert_eq!(t.url.as_deref(), Some("https://www.toureiffel.paris/"));
389    /// ```
390    pub fn url<S: Into<String>>(mut self, value: S) -> Self {
391        self.url = Some(value.into());
392        self
393    }
394
395    /// Set the representative image URL.
396    ///
397    /// ```
398    /// # use thing_matcher::Thing;
399    /// let t = Thing::builder().image("https://example.org/eiffel.jpg").build();
400    /// assert_eq!(t.image.as_deref(), Some("https://example.org/eiffel.jpg"));
401    /// ```
402    pub fn image<S: Into<String>>(mut self, value: S) -> Self {
403        self.image = Some(value.into());
404        self
405    }
406
407    /// Replace the entire `sameAs` list.
408    ///
409    /// ```
410    /// # use thing_matcher::Thing;
411    /// let t = Thing::builder()
412    ///     .same_as(vec![
413    ///         "https://www.wikidata.org/wiki/Q243".into(),
414    ///         "https://en.wikipedia.org/wiki/Eiffel_Tower".into(),
415    ///     ])
416    ///     .build();
417    /// assert_eq!(t.same_as.len(), 2);
418    /// ```
419    pub fn same_as(mut self, value: Vec<String>) -> Self {
420        self.same_as = value;
421        self
422    }
423
424    /// Append a single `sameAs` URL.
425    ///
426    /// ```
427    /// # use thing_matcher::Thing;
428    /// let t = Thing::builder()
429    ///     .add_same_as("https://www.wikidata.org/wiki/Q243")
430    ///     .build();
431    /// assert_eq!(t.same_as.len(), 1);
432    /// ```
433    pub fn add_same_as<S: Into<String>>(mut self, value: S) -> Self {
434        self.same_as.push(value.into());
435        self
436    }
437
438    /// Set the `mainEntityOfPage` URL.
439    ///
440    /// ```
441    /// # use thing_matcher::Thing;
442    /// let t = Thing::builder()
443    ///     .main_entity_of_page("https://en.wikipedia.org/wiki/Eiffel_Tower")
444    ///     .build();
445    /// assert_eq!(
446    ///     t.main_entity_of_page.as_deref(),
447    ///     Some("https://en.wikipedia.org/wiki/Eiffel_Tower"),
448    /// );
449    /// ```
450    pub fn main_entity_of_page<S: Into<String>>(mut self, value: S) -> Self {
451        self.main_entity_of_page = Some(value.into());
452        self
453    }
454
455    /// Replace the entire list of `additionalType` URIs.
456    ///
457    /// ```
458    /// # use thing_matcher::Thing;
459    /// let t = Thing::builder()
460    ///     .additional_types(vec!["https://schema.org/Landmark".into()])
461    ///     .build();
462    /// assert_eq!(t.additional_types.len(), 1);
463    /// ```
464    pub fn additional_types(mut self, value: Vec<String>) -> Self {
465        self.additional_types = value;
466        self
467    }
468
469    /// Append a single `additionalType` URI.
470    ///
471    /// ```
472    /// # use thing_matcher::Thing;
473    /// let t = Thing::builder()
474    ///     .add_additional_type("https://schema.org/Landmark")
475    ///     .build();
476    /// assert_eq!(t.additional_types.len(), 1);
477    /// ```
478    pub fn add_additional_type<S: Into<String>>(mut self, value: S) -> Self {
479        self.additional_types.push(value.into());
480        self
481    }
482
483    /// Replace the entire `subjectOf` list.
484    pub fn subject_of(mut self, value: Vec<String>) -> Self {
485        self.subject_of = value;
486        self
487    }
488
489    /// Append a single `subjectOf` URL.
490    pub fn add_subject_of<S: Into<String>>(mut self, value: S) -> Self {
491        self.subject_of.push(value.into());
492        self
493    }
494
495    /// Set the owner (person or organisation, as a string).
496    pub fn owner<S: Into<String>>(mut self, value: S) -> Self {
497        self.owner = Some(value.into());
498        self
499    }
500
501    /// Set the local identifier.
502    ///
503    /// ```
504    /// # use thing_matcher::Thing;
505    /// let t = Thing::builder().local_id("REF-12345").build();
506    /// assert_eq!(t.local_id.as_deref(), Some("REF-12345"));
507    /// ```
508    pub fn local_id<S: Into<String>>(mut self, value: S) -> Self {
509        self.local_id = Some(value.into());
510        self
511    }
512
513    /// Consume the builder and produce the [`Thing`].
514    ///
515    /// ```
516    /// # use thing_matcher::Thing;
517    /// let t = Thing::builder().name("Big Ben").build();
518    /// assert!(t.url.is_none());
519    /// ```
520    pub fn build(self) -> Thing {
521        Thing {
522            name: self.name,
523            alternate_names: self.alternate_names,
524            description: self.description,
525            disambiguating_description: self.disambiguating_description,
526            identifiers: self.identifiers,
527            url: self.url,
528            image: self.image,
529            same_as: self.same_as,
530            main_entity_of_page: self.main_entity_of_page,
531            additional_types: self.additional_types,
532            subject_of: self.subject_of,
533            owner: self.owner,
534            local_id: self.local_id,
535        }
536    }
537}
538
#[cfg(test)]
mod tests {
    use super::*;

    /// An untouched builder yields a `Thing` with nothing set.
    #[test]
    fn thing_builder_starts_empty() {
        let blank = Thing::builder().build();

        // Optional scalar fields.
        assert_eq!(blank.name, None);
        assert_eq!(blank.description, None);
        assert_eq!(blank.disambiguating_description, None);
        assert_eq!(blank.url, None);
        assert_eq!(blank.image, None);
        assert_eq!(blank.main_entity_of_page, None);
        assert_eq!(blank.owner, None);
        assert_eq!(blank.local_id, None);

        // List fields.
        assert_eq!(blank.alternate_names, Vec::<String>::new());
        assert_eq!(blank.identifiers, Vec::<Identifier>::new());
        assert_eq!(blank.same_as, Vec::<String>::new());
        assert_eq!(blank.additional_types, Vec::<String>::new());
        assert_eq!(blank.subject_of, Vec::<String>::new());
    }

    /// String setters take both borrowed and owned strings.
    #[test]
    fn thing_builder_accepts_str_and_string() {
        let owned_alias = String::from("La Tour Eiffel");
        let thing = Thing::builder()
            .name("Eiffel Tower")
            .add_alternate_name(owned_alias)
            .build();

        assert_eq!(thing.name.as_deref(), Some("Eiffel Tower"));
        assert_eq!(thing.alternate_names, vec!["La Tour Eiffel".to_string()]);
    }

    /// `validate` passes with a name and reports `MissingField` without.
    #[test]
    fn thing_validate_requires_a_name() {
        let named = Thing::builder().name("Eiffel Tower").build();
        assert!(named.validate().is_ok());

        let unnamed = Thing::builder().build();
        let err = unnamed.validate().expect_err("should be missing");
        assert!(matches!(err, crate::MatchingError::MissingField(_)));
    }

    /// A populated `Thing` survives a JSON round trip unchanged.
    #[test]
    fn thing_round_trips_through_serde() {
        let wikidata = Identifier::new("wikidata", "Q243").unwrap();
        let original = Thing::builder()
            .name("Eiffel Tower")
            .add_alternate_name("La Tour Eiffel")
            .description("Iron tower in Paris.")
            .url("https://www.toureiffel.paris/")
            .add_identifier(wikidata)
            .add_same_as("https://www.wikidata.org/wiki/Q243")
            .add_additional_type("https://schema.org/Landmark")
            .build();

        let encoded = serde_json::to_string(&original).expect("serialise");
        let decoded: Thing = serde_json::from_str(&encoded).expect("deserialise");
        assert_eq!(original, decoded);
    }

    /// The list setter stores exactly the vector it was given.
    #[test]
    fn alternate_names_setter_replaces_vec() {
        let names = vec!["X".into(), "Y".into()];
        let thing = Thing::builder().alternate_names(names).build();
        assert_eq!(
            thing.alternate_names,
            vec!["X".to_string(), "Y".to_string()]
        );
    }

    /// Both identifier components lose their surrounding whitespace.
    #[test]
    fn identifier_trims_value_and_property_id() {
        let Identifier { property_id, value } =
            Identifier::new("  wikidata  ", "   Q243 ").unwrap();
        assert_eq!(property_id, "wikidata");
        assert_eq!(value, "Q243");
    }

    /// Empty or whitespace-only components are refused.
    #[test]
    fn identifier_rejects_empty_components() {
        let rejected = [
            ("wikidata", ""),
            ("wikidata", "    "),
            ("", "Q243"),
            ("   ", "Q243"),
        ];
        for (property_id, value) in rejected {
            assert!(
                Identifier::new(property_id, value).is_none(),
                "({:?}, {:?}) should be rejected",
                property_id,
                value
            );
        }
    }

    /// The same value under two different schemes is not equal.
    #[test]
    fn identifier_equality_is_property_scoped() {
        let from_google = Identifier::new("google", "X").unwrap();
        let from_wikidata = Identifier::new("wikidata", "X").unwrap();
        assert_ne!(from_google, from_wikidata);
    }

    /// An `Identifier` survives a JSON round trip unchanged.
    #[test]
    fn identifier_round_trips_through_serde() {
        let original = Identifier::new("custom", "abc-123").unwrap();
        let encoded = serde_json::to_string(&original).expect("serialise");
        let decoded: Identifier = serde_json::from_str(&encoded).expect("deserialise");
        assert_eq!(original, decoded);
    }
}