thing_matcher/models.rs
1//! Data models for things, aligned with `schema.org/Thing`.
2//!
3//! This module is intentionally **logic-free**: it defines the types that
4//! flow through the matching engine but contains no matching code itself.
5//! See [`crate::matcher`] for the engine and [`crate::normalizer`] for the
6//! text transformations that the matcher applies to these fields.
7//!
8//! All public types here are `Serialize + Deserialize` so they round-trip
9//! through JSON, MessagePack, or any other `serde` format.
10//!
11//! ## Schema.org alignment
12//!
13//! The fields of [`Thing`] correspond to the properties of
14//! `schema.org/Thing`:
15//!
16//! | Rust field | schema.org property |
17//! |---|---|
18//! | `name` | `name` |
19//! | `alternate_names` | `alternateName` |
20//! | `description` | `description` |
21//! | `disambiguating_description` | `disambiguatingDescription` |
22//! | `identifiers` | `identifier` (as `PropertyValue`) |
23//! | `url` | `url` |
24//! | `image` | `image` |
25//! | `same_as` | `sameAs` |
26//! | `main_entity_of_page` | `mainEntityOfPage` |
27//! | `additional_types` | `additionalType` |
28//! | `subject_of` | `subjectOf` |
29//! | `owner` | `owner` |
30//!
31//! ## Building a thing
32//!
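//! Build values with [`Thing::builder`]. [`Thing`] is `#[non_exhaustive]`,
//! so code outside this crate cannot use struct-literal syntax anyway. The
//! builder's single-string setters (`name`, `url`, `add_alternate_name`, …)
//! take `impl Into<String>`, so call-sites can pass `&str`, `String`, or
//! `&String` interchangeably.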
36//!
37//! ```
38//! use thing_matcher::Thing;
39//!
40//! let t = Thing::builder()
41//! .name("Eiffel Tower")
42//! .add_alternate_name("La Tour Eiffel")
43//! .url("https://www.toureiffel.paris/")
44//! .build();
45//!
46//! assert_eq!(t.name.as_deref(), Some("Eiffel Tower"));
47//! ```
48
49use serde::{Deserialize, Serialize};
50
/// External identifier for a thing, modelled on `schema.org/PropertyValue`.
///
/// Two `Identifier` values are equal iff both their `property_id` and their
/// `value` are equal. Equality is structural (the derived `PartialEq`
/// compares the two strings as-is) — no per-scheme canonicalisation is
/// performed.
///
/// `property_id` is the issuer or vocabulary that names the identifier,
/// such as `"wikidata"`, `"isbn"`, `"doi"`, `"gtin"`, or a fully-qualified
/// URL. `value` is the identifier string itself.
///
/// Prefer [`Identifier::new`] for construction: it trims both components
/// and rejects blank ones. Both fields are public and the type derives
/// `Deserialize` without further attributes, so values written as a struct
/// literal or read through `serde` do not pass through `new` and are
/// neither trimmed nor checked for emptiness.
///
/// # Example
///
/// ```
/// use thing_matcher::Identifier;
///
/// let a = Identifier::new("wikidata", "Q243").unwrap();
/// let b = Identifier::new("wikidata", " Q243 ").unwrap();
/// assert_eq!(a, b, "values are trimmed at construction");
/// assert!(Identifier::new("wikidata", "").is_none());
/// ```
#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub struct Identifier {
    /// Issuer or vocabulary that scopes the identifier, e.g.
    /// `"wikidata"`, `"isbn"`, `"doi"`, `"gtin"`, or a URL. Trimmed of
    /// surrounding whitespace when built through [`Identifier::new`].
    pub property_id: String,
    /// The identifier value. Trimmed of surrounding whitespace when built
    /// through [`Identifier::new`].
    pub value: String,
}
79
80impl Identifier {
81 /// Construct an [`Identifier`], trimming the value and property id of
82 /// surrounding whitespace. Returns `None` if either trimmed component
83 /// is empty.
84 ///
85 /// No further normalisation is applied — different vocabularies have
86 /// different rules and the crate makes no assumptions.
87 ///
88 /// # Example
89 ///
90 /// ```
91 /// use thing_matcher::Identifier;
92 ///
93 /// let id = Identifier::new("wikidata", " Q243 ").unwrap();
94 /// assert_eq!(id.value, "Q243");
95 /// assert_eq!(id.property_id, "wikidata");
96 ///
97 /// assert!(Identifier::new("wikidata", " ").is_none());
98 /// assert!(Identifier::new(" ", "Q243").is_none());
99 /// ```
100 pub fn new(property_id: impl Into<String>, value: impl Into<String>) -> Option<Self> {
101 let property_id = property_id.into().trim().to_string();
102 let value = value.into().trim().to_string();
103 if property_id.is_empty() || value.is_empty() {
104 None
105 } else {
106 Some(Self { property_id, value })
107 }
108 }
109}
110
/// Core data structure for a thing, aligned with `schema.org/Thing`.
///
/// Every field is optional (`Option`) or defaults to an empty `Vec`; the
/// derived `Default` therefore yields a thing with nothing set. The matcher
/// tolerates missing data field-by-field — a `None` value never penalises
/// a thing. See [`crate::matcher::MatchingEngine::match_things`] for how
/// missing fields affect the weighted score.
///
/// Construct via [`Thing::builder`]. The struct is `#[non_exhaustive]`, so
/// code outside this crate cannot use struct-literal syntax at all, which
/// keeps call-sites compiling when fields are added later.
///
/// # Example
///
/// ```
/// use thing_matcher::Thing;
///
/// let t = Thing::builder()
///     .name("Eiffel Tower")
///     .add_alternate_name("La Tour Eiffel")
///     .description("Wrought-iron lattice tower on the Champ de Mars in Paris.")
///     .url("https://www.toureiffel.paris/")
///     .build();
///
/// assert_eq!(t.name.as_deref(), Some("Eiffel Tower"));
/// assert_eq!(t.alternate_names, vec!["La Tour Eiffel".to_string()]);
/// ```
///
/// `Thing` round-trips through `serde`.
///
/// ```
/// # use thing_matcher::Thing;
/// let t = Thing::builder().name("Eiffel Tower").build();
/// let json = serde_json::to_string(&t).unwrap();
/// let back: Thing = serde_json::from_str(&json).unwrap();
/// assert_eq!(t, back);
/// ```
#[derive(Debug, Clone, Default, PartialEq, Eq, Serialize, Deserialize)]
#[non_exhaustive]
pub struct Thing {
    /// Primary canonical name. Corresponds to `schema.org/name`.
    /// This is the only field [`Thing::validate`] inspects.
    pub name: Option<String>,

    /// Aliases, endonyms, or translations. Corresponds to
    /// `schema.org/alternateName`. The matcher takes the best score
    /// across the cartesian product of primary + alternates.
    pub alternate_names: Vec<String>,

    /// Free-form description. Corresponds to `schema.org/description`.
    pub description: Option<String>,

    /// Short disambiguating description. Corresponds to
    /// `schema.org/disambiguatingDescription`.
    pub disambiguating_description: Option<String>,

    /// Scheme-scoped external identifiers. Corresponds to
    /// `schema.org/identifier` modelled as `PropertyValue`. Sharing any
    /// one `(property_id, value)` pair across two things is a
    /// deterministic match.
    pub identifiers: Vec<Identifier>,

    /// Canonical URL of the item. Corresponds to `schema.org/url`.
    /// Stored as a plain string; this module does not parse or check it.
    pub url: Option<String>,

    /// URL of a representative image. Corresponds to `schema.org/image`.
    pub image: Option<String>,

    /// Reference URLs that unambiguously indicate the same item, e.g. a
    /// Wikipedia article or an authority record. Corresponds to
    /// `schema.org/sameAs`.
    pub same_as: Vec<String>,

    /// Page (URL) for which this thing is the main entity. Corresponds
    /// to `schema.org/mainEntityOfPage`.
    pub main_entity_of_page: Option<String>,

    /// Additional types from external vocabularies, typically schema.org
    /// subtypes or other ontology URIs. Corresponds to
    /// `schema.org/additionalType`.
    pub additional_types: Vec<String>,

    /// Works or events about this thing (URLs). Corresponds to
    /// `schema.org/subjectOf`.
    pub subject_of: Vec<String>,

    /// Person or organisation that owns this thing. Corresponds to
    /// `schema.org/owner`. Stored as a string (a name or URL) — the
    /// crate does not model `Person` / `Organization` separately.
    pub owner: Option<String>,

    /// Local identifier issued by the originating system. It is the one
    /// field absent from the schema.org table in the module docs. Not
    /// normalised, not scored — different organisations may issue
    /// colliding values. Kept for round-trip honesty.
    pub local_id: Option<String>,
}
204
205impl Thing {
206 /// Begin constructing a [`Thing`] with the [`ThingBuilder`].
207 ///
208 /// All fields default to `None` / empty until a setter is called.
209 ///
210 /// # Example
211 ///
212 /// ```
213 /// use thing_matcher::Thing;
214 ///
215 /// let t = Thing::builder()
216 /// .name("Big Ben")
217 /// .build();
218 ///
219 /// assert_eq!(t.name.as_deref(), Some("Big Ben"));
220 /// ```
221 pub fn builder() -> ThingBuilder {
222 ThingBuilder::default()
223 }
224
225 /// Validate that the thing carries a primary name.
226 ///
227 /// Returns `Ok(())` if `name` is set. Otherwise returns
228 /// [`crate::MatchingError::MissingField`].
229 ///
230 /// This is **not** invoked automatically by the matcher — call it at
231 /// the system boundary when you ingest data, not on every
232 /// comparison.
233 ///
234 /// # Example
235 ///
236 /// ```
237 /// use thing_matcher::Thing;
238 ///
239 /// assert!(Thing::builder().name("Eiffel Tower").build().validate().is_ok());
240 /// assert!(Thing::builder().build().validate().is_err());
241 /// ```
242 pub fn validate(&self) -> crate::Result<()> {
243 if self.name.is_none() {
244 return Err(crate::MatchingError::MissingField(
245 "name is required".to_string(),
246 ));
247 }
248 Ok(())
249 }
250}
251
252/// Fluent builder for [`Thing`].
253///
254/// All string setters accept `impl Into<String>` so call-sites may pass
255/// `&str`, `String`, or `&String` interchangeably without explicit
256/// conversion.
257///
258/// # Example
259///
260/// ```
261/// use thing_matcher::{Thing, ThingBuilder};
262///
263/// let t: Thing = ThingBuilder::default()
264/// .name(String::from("Eiffel Tower"))
265/// .add_alternate_name("La Tour Eiffel")
266/// .url("https://www.toureiffel.paris/")
267/// .add_same_as("https://www.wikidata.org/wiki/Q243")
268/// .build();
269///
270/// assert_eq!(t.name.as_deref(), Some("Eiffel Tower"));
271/// assert_eq!(t.same_as.len(), 1);
272/// ```
273#[derive(Default)]
274pub struct ThingBuilder {
275 name: Option<String>,
276 alternate_names: Vec<String>,
277 description: Option<String>,
278 disambiguating_description: Option<String>,
279 identifiers: Vec<Identifier>,
280 url: Option<String>,
281 image: Option<String>,
282 same_as: Vec<String>,
283 main_entity_of_page: Option<String>,
284 additional_types: Vec<String>,
285 subject_of: Vec<String>,
286 owner: Option<String>,
287 local_id: Option<String>,
288}
289
290impl ThingBuilder {
291 /// Set the primary canonical name.
292 ///
293 /// ```
294 /// # use thing_matcher::Thing;
295 /// let t = Thing::builder().name("Eiffel Tower").build();
296 /// assert_eq!(t.name.as_deref(), Some("Eiffel Tower"));
297 /// ```
298 pub fn name<S: Into<String>>(mut self, value: S) -> Self {
299 self.name = Some(value.into());
300 self
301 }
302
303 /// Replace the entire list of alternate names.
304 ///
305 /// ```
306 /// # use thing_matcher::Thing;
307 /// let t = Thing::builder()
308 /// .alternate_names(vec!["La Tour Eiffel".into(), "Tour Eiffel".into()])
309 /// .build();
310 /// assert_eq!(t.alternate_names.len(), 2);
311 /// ```
312 pub fn alternate_names(mut self, value: Vec<String>) -> Self {
313 self.alternate_names = value;
314 self
315 }
316
317 /// Append a single alternate name.
318 ///
319 /// ```
320 /// # use thing_matcher::Thing;
321 /// let t = Thing::builder()
322 /// .add_alternate_name("La Tour Eiffel")
323 /// .add_alternate_name("Tour Eiffel")
324 /// .build();
325 /// assert_eq!(t.alternate_names.len(), 2);
326 /// ```
327 pub fn add_alternate_name<S: Into<String>>(mut self, value: S) -> Self {
328 self.alternate_names.push(value.into());
329 self
330 }
331
332 /// Set the free-form description.
333 ///
334 /// ```
335 /// # use thing_matcher::Thing;
336 /// let t = Thing::builder().description("A landmark in Paris.").build();
337 /// assert_eq!(t.description.as_deref(), Some("A landmark in Paris."));
338 /// ```
339 pub fn description<S: Into<String>>(mut self, value: S) -> Self {
340 self.description = Some(value.into());
341 self
342 }
343
344 /// Set the disambiguating description.
345 ///
346 /// ```
347 /// # use thing_matcher::Thing;
348 /// let t = Thing::builder().disambiguating_description("The Paris one.").build();
349 /// assert_eq!(t.disambiguating_description.as_deref(), Some("The Paris one."));
350 /// ```
351 pub fn disambiguating_description<S: Into<String>>(mut self, value: S) -> Self {
352 self.disambiguating_description = Some(value.into());
353 self
354 }
355
356 /// Replace the entire list of identifiers.
357 ///
358 /// ```
359 /// # use thing_matcher::{Thing, Identifier};
360 /// let id = Identifier::new("wikidata", "Q243").unwrap();
361 /// let t = Thing::builder().identifiers(vec![id.clone()]).build();
362 /// assert_eq!(t.identifiers, vec![id]);
363 /// ```
364 pub fn identifiers(mut self, value: Vec<Identifier>) -> Self {
365 self.identifiers = value;
366 self
367 }
368
369 /// Append a single identifier.
370 ///
371 /// ```
372 /// # use thing_matcher::{Thing, Identifier};
373 /// let t = Thing::builder()
374 /// .add_identifier(Identifier::new("wikidata", "Q243").unwrap())
375 /// .build();
376 /// assert_eq!(t.identifiers.len(), 1);
377 /// ```
378 pub fn add_identifier(mut self, value: Identifier) -> Self {
379 self.identifiers.push(value);
380 self
381 }
382
383 /// Set the canonical URL.
384 ///
385 /// ```
386 /// # use thing_matcher::Thing;
387 /// let t = Thing::builder().url("https://www.toureiffel.paris/").build();
388 /// assert_eq!(t.url.as_deref(), Some("https://www.toureiffel.paris/"));
389 /// ```
390 pub fn url<S: Into<String>>(mut self, value: S) -> Self {
391 self.url = Some(value.into());
392 self
393 }
394
395 /// Set the representative image URL.
396 ///
397 /// ```
398 /// # use thing_matcher::Thing;
399 /// let t = Thing::builder().image("https://example.org/eiffel.jpg").build();
400 /// assert_eq!(t.image.as_deref(), Some("https://example.org/eiffel.jpg"));
401 /// ```
402 pub fn image<S: Into<String>>(mut self, value: S) -> Self {
403 self.image = Some(value.into());
404 self
405 }
406
407 /// Replace the entire `sameAs` list.
408 ///
409 /// ```
410 /// # use thing_matcher::Thing;
411 /// let t = Thing::builder()
412 /// .same_as(vec![
413 /// "https://www.wikidata.org/wiki/Q243".into(),
414 /// "https://en.wikipedia.org/wiki/Eiffel_Tower".into(),
415 /// ])
416 /// .build();
417 /// assert_eq!(t.same_as.len(), 2);
418 /// ```
419 pub fn same_as(mut self, value: Vec<String>) -> Self {
420 self.same_as = value;
421 self
422 }
423
424 /// Append a single `sameAs` URL.
425 ///
426 /// ```
427 /// # use thing_matcher::Thing;
428 /// let t = Thing::builder()
429 /// .add_same_as("https://www.wikidata.org/wiki/Q243")
430 /// .build();
431 /// assert_eq!(t.same_as.len(), 1);
432 /// ```
433 pub fn add_same_as<S: Into<String>>(mut self, value: S) -> Self {
434 self.same_as.push(value.into());
435 self
436 }
437
438 /// Set the `mainEntityOfPage` URL.
439 ///
440 /// ```
441 /// # use thing_matcher::Thing;
442 /// let t = Thing::builder()
443 /// .main_entity_of_page("https://en.wikipedia.org/wiki/Eiffel_Tower")
444 /// .build();
445 /// assert_eq!(
446 /// t.main_entity_of_page.as_deref(),
447 /// Some("https://en.wikipedia.org/wiki/Eiffel_Tower"),
448 /// );
449 /// ```
450 pub fn main_entity_of_page<S: Into<String>>(mut self, value: S) -> Self {
451 self.main_entity_of_page = Some(value.into());
452 self
453 }
454
455 /// Replace the entire list of `additionalType` URIs.
456 ///
457 /// ```
458 /// # use thing_matcher::Thing;
459 /// let t = Thing::builder()
460 /// .additional_types(vec!["https://schema.org/Landmark".into()])
461 /// .build();
462 /// assert_eq!(t.additional_types.len(), 1);
463 /// ```
464 pub fn additional_types(mut self, value: Vec<String>) -> Self {
465 self.additional_types = value;
466 self
467 }
468
469 /// Append a single `additionalType` URI.
470 ///
471 /// ```
472 /// # use thing_matcher::Thing;
473 /// let t = Thing::builder()
474 /// .add_additional_type("https://schema.org/Landmark")
475 /// .build();
476 /// assert_eq!(t.additional_types.len(), 1);
477 /// ```
478 pub fn add_additional_type<S: Into<String>>(mut self, value: S) -> Self {
479 self.additional_types.push(value.into());
480 self
481 }
482
483 /// Replace the entire `subjectOf` list.
484 pub fn subject_of(mut self, value: Vec<String>) -> Self {
485 self.subject_of = value;
486 self
487 }
488
489 /// Append a single `subjectOf` URL.
490 pub fn add_subject_of<S: Into<String>>(mut self, value: S) -> Self {
491 self.subject_of.push(value.into());
492 self
493 }
494
495 /// Set the owner (person or organisation, as a string).
496 pub fn owner<S: Into<String>>(mut self, value: S) -> Self {
497 self.owner = Some(value.into());
498 self
499 }
500
501 /// Set the local identifier.
502 ///
503 /// ```
504 /// # use thing_matcher::Thing;
505 /// let t = Thing::builder().local_id("REF-12345").build();
506 /// assert_eq!(t.local_id.as_deref(), Some("REF-12345"));
507 /// ```
508 pub fn local_id<S: Into<String>>(mut self, value: S) -> Self {
509 self.local_id = Some(value.into());
510 self
511 }
512
513 /// Consume the builder and produce the [`Thing`].
514 ///
515 /// ```
516 /// # use thing_matcher::Thing;
517 /// let t = Thing::builder().name("Big Ben").build();
518 /// assert!(t.url.is_none());
519 /// ```
520 pub fn build(self) -> Thing {
521 Thing {
522 name: self.name,
523 alternate_names: self.alternate_names,
524 description: self.description,
525 disambiguating_description: self.disambiguating_description,
526 identifiers: self.identifiers,
527 url: self.url,
528 image: self.image,
529 same_as: self.same_as,
530 main_entity_of_page: self.main_entity_of_page,
531 additional_types: self.additional_types,
532 subject_of: self.subject_of,
533 owner: self.owner,
534 local_id: self.local_id,
535 }
536 }
537}
538
#[cfg(test)]
mod tests {
    use super::*;

    /// A builder on which no setter was called yields a thing with every
    /// optional field unset and every list empty.
    #[test]
    fn thing_builder_starts_empty() {
        let blank = Thing::builder().build();

        let optional_fields = [
            &blank.name,
            &blank.description,
            &blank.disambiguating_description,
            &blank.url,
            &blank.image,
            &blank.main_entity_of_page,
            &blank.owner,
            &blank.local_id,
        ];
        assert!(optional_fields.iter().all(|field| field.is_none()));

        let string_lists = [
            &blank.alternate_names,
            &blank.same_as,
            &blank.additional_types,
            &blank.subject_of,
        ];
        assert!(string_lists.iter().all(|list| list.is_empty()));
        assert!(blank.identifiers.is_empty());
    }

    /// String setters take both borrowed and owned strings.
    #[test]
    fn thing_builder_accepts_str_and_string() {
        let owned_alias = String::from("La Tour Eiffel");
        let built = Thing::builder()
            .name("Eiffel Tower")
            .add_alternate_name(owned_alias)
            .build();

        assert_eq!(built.name, Some(String::from("Eiffel Tower")));
        assert_eq!(built.alternate_names, ["La Tour Eiffel"]);
    }

    /// `validate` passes with a name and reports `MissingField` without one.
    #[test]
    fn thing_validate_requires_a_name() {
        let named = Thing::builder().name("Eiffel Tower").build();
        assert!(named.validate().is_ok());

        let unnamed = Thing::builder().build();
        let err = unnamed.validate().expect_err("should be missing");
        assert!(matches!(err, crate::MatchingError::MissingField(_)));
    }

    /// A populated thing survives a JSON encode/decode cycle unchanged.
    #[test]
    fn thing_round_trips_through_serde() {
        let wikidata = Identifier::new("wikidata", "Q243").unwrap();
        let original = Thing::builder()
            .name("Eiffel Tower")
            .add_alternate_name("La Tour Eiffel")
            .description("Iron tower in Paris.")
            .url("https://www.toureiffel.paris/")
            .add_identifier(wikidata)
            .add_same_as("https://www.wikidata.org/wiki/Q243")
            .add_additional_type("https://schema.org/Landmark")
            .build();

        let encoded = serde_json::to_string(&original).expect("serialise");
        let decoded: Thing = serde_json::from_str(&encoded).expect("deserialise");
        assert_eq!(original, decoded);
    }

    /// The whole-list setter stores exactly the vector it was given.
    #[test]
    fn alternate_names_setter_replaces_vec() {
        let names = vec!["X".to_string(), "Y".to_string()];
        let built = Thing::builder().alternate_names(names.clone()).build();
        assert_eq!(built.alternate_names, names);
    }

    /// Both components lose their surrounding whitespace.
    #[test]
    fn identifier_trims_value_and_property_id() {
        let Identifier { property_id, value } = Identifier::new(" wikidata ", " Q243 ").unwrap();
        assert_eq!(property_id, "wikidata");
        assert_eq!(value, "Q243");
    }

    /// An empty or whitespace-only component on either side yields `None`.
    #[test]
    fn identifier_rejects_empty_components() {
        let rejected = [
            ("wikidata", ""),
            ("wikidata", " "),
            ("", "Q243"),
            (" ", "Q243"),
        ];
        for &(property_id, value) in &rejected {
            assert!(Identifier::new(property_id, value).is_none());
        }
    }

    /// The same value under two different schemes is two different identifiers.
    #[test]
    fn identifier_equality_is_property_scoped() {
        let google = Identifier::new("google", "X").unwrap();
        let wikidata = Identifier::new("wikidata", "X").unwrap();
        assert_ne!(google, wikidata);
    }

    /// An identifier survives a JSON encode/decode cycle unchanged.
    #[test]
    fn identifier_round_trips_through_serde() {
        let original = Identifier::new("custom", "abc-123").unwrap();
        let encoded = serde_json::to_string(&original).expect("serialise");
        let decoded: Identifier = serde_json::from_str(&encoded).expect("deserialise");
        assert_eq!(original, decoded);
    }
}