Skip to main content

schemaorg_rs/
types.rs

//! Core data types for Schema.org structured data extraction.
//!
//! This module defines the shared data model used across all extraction
//! formats (JSON-LD, Microdata, `RDFa` Lite). The central type is
//! [`SchemaNode`], which represents a single structured data entity
//! (e.g. a `Product`, an `Offer`). Nodes contain typed [`SchemaValue`]s
//! organized in insertion-ordered property maps.
//!
//! # Data Model
//!
//! ```text
//! StructuredDataGraph
//!   +---- Vec<SchemaNode>
//!         +---- types: ["Product"]
//!         +---- properties: { "name" -> [Text("Widget")],
//!         |                  "offers" -> [Node(Offer { ... })] }
//!         +---- source_format: JsonLd
//!         +---- source_location: Some({ line: 3, column: 1, byte_offset: 42 })
//! ```
//!
//! # Examples
//!
//! ```
//! use schemaorg_rs::types::{SchemaNode, SchemaValue, SourceFormat};
//! use indexmap::IndexMap;
//!
//! let node = SchemaNode {
//!     types: vec!["Product".into()],
//!     properties: IndexMap::from([(
//!         "name".into(),
//!         vec![SchemaValue::Text("Widget".into())],
//!     )]),
//!     source_format: SourceFormat::JsonLd,
//!     source_location: None,
//! };
//!
//! assert_eq!(node.types, vec!["Product"]);
//! ```
39
40use std::fmt;
41
42use indexmap::IndexMap;
43use serde::{Deserialize, Serialize};
44
45/// Source format of the extracted structured data.
46///
47/// Indicates which HTML markup format a [`SchemaNode`] was extracted from.
48///
49/// # Examples
50///
51/// ```
52/// use schemaorg_rs::types::SourceFormat;
53///
54/// let format = SourceFormat::JsonLd;
55/// assert_eq!(format.to_string(), "JSON-LD");
56/// ```
57#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
58#[non_exhaustive]
59pub enum SourceFormat {
60    /// JSON-LD (`<script type="application/ld+json">`)
61    JsonLd,
62    /// HTML Microdata (`itemscope`, `itemprop`)
63    Microdata,
64    /// `RDFa` Lite 1.1 (`vocab`, `typeof`, `property`)
65    RdfaLite,
66}
67
68/// Location in the original HTML document.
69///
70/// Used to map extracted data back to the source markup for diagnostics
71/// and error reporting.
72///
73/// # Examples
74///
75/// ```
76/// use schemaorg_rs::types::SourceLocation;
77///
78/// let loc = SourceLocation { line: 5, column: 3, byte_offset: 120 };
79/// assert_eq!(loc.line, 5);
80/// ```
81#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
82pub struct SourceLocation {
83    /// 1-indexed line number.
84    pub line: usize,
85    /// 1-indexed column number.
86    pub column: usize,
87    /// 0-indexed byte offset from the start of the HTML document.
88    pub byte_offset: usize,
89}
90
91/// A value within a structured data node.
92///
93/// Represents the different value types that a Schema.org property can hold.
94/// Properties are multi-valued (stored as `Vec<SchemaValue>` in [`SchemaNode`]).
95///
96/// # `PartialEq` note
97///
98/// The `Number(f64)` variant uses `f64` partial equality via the derived impl.
99/// This means `NaN != NaN`, which is acceptable for test assertions but not for
100/// production equality checks.
101///
102/// # Examples
103///
104/// ```
105/// use schemaorg_rs::types::SchemaValue;
106///
107/// let text = SchemaValue::Text("Widget".into());
108/// let url = SchemaValue::Url("https://example.com".into());
109/// let flag = SchemaValue::Boolean(true);
110/// let price = SchemaValue::Number(29.99);
111/// ```
112#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
113#[non_exhaustive]
114pub enum SchemaValue {
115    /// Plain text content.
116    Text(String),
117    /// A URL value (starts with `http://`, `https://`, or `mailto:`).
118    Url(String),
119    /// A nested structured data node.
120    Node(Box<SchemaNode>),
121    /// A boolean value.
122    Boolean(bool),
123    /// A numeric value (IEEE 754 `f64`).
124    Number(f64),
125    /// A raw datetime string. Actual datetime validation happens in M2.
126    DateTime(String),
127}
128
129/// A single structured data node (e.g. a `Product`, an `Offer`).
130///
131/// Each node represents one Schema.org entity extracted from the HTML
132/// document, retaining its [`SourceFormat`] so callers can distinguish
133/// which markup produced it.
134///
135/// # Examples
136///
137/// ```
138/// use schemaorg_rs::types::{SchemaNode, SchemaValue, SourceFormat};
139/// use indexmap::IndexMap;
140///
141/// let node = SchemaNode {
142/// types: vec!["Product".into()],
143/// properties: IndexMap::from([(
144/// "name".into(),
145/// vec![SchemaValue::Text("Widget".into())],
146/// )]),
147/// source_format: SourceFormat::JsonLd,
148/// source_location: None,
149/// };
150///
151/// assert_eq!(node.id(), None);
152/// ```
153#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
154pub struct SchemaNode {
155    /// Schema.org type(s), e.g. `["Product", "IndividualProduct"]`.
156    pub types: Vec<String>,
157    /// Properties: key -> list of values (insertion-ordered).
158    pub properties: IndexMap<String, Vec<SchemaValue>>,
159    /// Source format that this node was extracted from.
160    pub source_format: SourceFormat,
161    /// Location in the original HTML document.
162    pub source_location: Option<SourceLocation>,
163}
164
165impl SchemaNode {
166    /// Returns the `@id` of this node, if present.
167    ///
168    /// # Examples
169    ///
170    /// ```
171    /// use schemaorg_rs::types::{SchemaNode, SchemaValue, SourceFormat};
172    /// use indexmap::IndexMap;
173    ///
174    /// let node = SchemaNode {
175    /// types: vec!["Product".into()],
176    /// properties: IndexMap::from([(
177    /// "@id".into(),
178    /// vec![SchemaValue::Text("#product1".into())],
179    /// )]),
180    /// source_format: SourceFormat::JsonLd,
181    /// source_location: None,
182    /// };
183    ///
184    /// assert_eq!(node.id(), Some("#product1"));
185    /// ```
186    #[must_use]
187    #[inline]
188    pub fn id(&self) -> Option<&str> {
189        self.properties
190            .get("@id")
191            .and_then(|vals| vals.first())
192            .and_then(|v| match v {
193                SchemaValue::Text(s) => Some(s.as_str()),
194                _ => None,
195            })
196    }
197}
198
199impl fmt::Display for SourceFormat {
200    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
201        match self {
202            Self::JsonLd => write!(f, "JSON-LD"),
203            Self::Microdata => write!(f, "Microdata"),
204            Self::RdfaLite => write!(f, "RDFa Lite"),
205        }
206    }
207}
208
209impl fmt::Display for SchemaValue {
210    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
211        match self {
212            Self::Text(s) | Self::Url(s) | Self::DateTime(s) => write!(f, "{s}"),
213            Self::Boolean(b) => write!(f, "{b}"),
214            Self::Number(n) => write!(f, "{n}"),
215            Self::Node(n) => write!(f, "[{} node]", n.types.join(", ")),
216        }
217    }
218}