Skip to main content

wme_models/
metadata.rs

1//! Metadata types for the Wikimedia Enterprise API.
2//!
3//! This module provides types for reference data, snapshots, and event metadata
4//! used across all Enterprise APIs. These types describe the context around
5//! articles rather than article content itself.
6//!
7//! # Namespaces
8//!
9//! Wikimedia projects use namespaces to organize different types of pages:
10//! - **0 (Main)**: Article content
11//! - **6 (File)**: Media files
12//! - **10 (Template)**: Reusable wikitext
13//! - **14 (Category)**: Topic groupings
14//!
15//! See [`Namespace`] for namespace information.
16//!
17//! # Snapshots
18//!
19//! Snapshots provide complete dumps of Wikimedia projects. Use [`SnapshotInfo`]
20//! to discover available snapshots and their download chunks.
21//!
22//! # Events
23//!
24//! The Realtime API uses event-based updates. See [`EventMetadata`] for
25//! partition/offset tracking to enable resume functionality.
26
27use chrono::{DateTime, Utc};
28use serde::{Deserialize, Serialize};
29
30/// Event types for article changes.
31///
32/// The Realtime API streams events of these types as they happen.
33/// Event types indicate what kind of change occurred to an article.
34#[derive(Debug, Clone, Deserialize, Serialize, PartialEq)]
35pub enum EventType {
36    /// Article was created or updated (content changed)
37    #[serde(rename = "update")]
38    Update,
39    /// Article was deleted
40    #[serde(rename = "delete")]
41    Delete,
42    /// Article visibility changed (content/editor/comment hidden)
43    #[serde(rename = "visibility-change")]
44    VisibilityChange,
45}
46
47/// Event metadata for articles.
48///
49/// Tracks events through the Wikimedia Enterprise system. The partition and
50/// offset fields enable resuming streams after disconnections.
51///
52/// # Resuming Streams
53///
54/// ```ignore
55/// // Store the last processed offset for each partition
56/// let offsets: HashMap<u32, u64> = load_checkpoints();
57///
58/// // When reconnecting, pass these offsets
59/// client.connect_with_offsets(offsets).await?;
60/// ```
61#[derive(Debug, Clone, Deserialize, Serialize, PartialEq)]
62pub struct EventMetadata {
63    /// Event UUID (for tracking through the system)
64    pub identifier: String,
65    /// Event type (update, delete, visibility-change)
66    #[serde(rename = "type")]
67    pub event_type: EventType,
68    /// Event creation timestamp (when entered Enterprise system)
69    pub date_created: DateTime<Utc>,
70    /// Event publication timestamp (Realtime API only)
71    pub date_published: Option<DateTime<Utc>>,
72    /// Partition number (Realtime API only, for resume)
73    pub partition: Option<u32>,
74    /// Offset within partition (Realtime API only, for resume)
75    pub offset: Option<u64>,
76}
77
78/// Namespace information.
79///
80/// Namespaces organize different types of pages in Wikimedia projects.
81/// The identifier is the namespace number (0, 6, 10, 14, etc.).
82///
83/// # Common Namespaces
84///
85/// - `0` - Main (article content)
86/// - `6` - File (media uploads)
87/// - `10` - Template (reusable wikitext)
88/// - `14` - Category (topic groupings)
89#[derive(Debug, Clone, Deserialize, Serialize, PartialEq)]
90pub struct Namespace {
91    /// Namespace ID (0 = Main, 6 = File, 10 = Template, 14 = Category)
92    pub identifier: u32,
93    /// Namespace name
94    pub name: Option<String>,
95    /// Namespace description
96    pub description: Option<String>,
97}
98
99/// Language information.
100///
101/// Describes a language supported by Wikimedia projects.
102#[derive(Debug, Clone, Deserialize, Serialize, PartialEq)]
103pub struct Language {
104    /// Language code (e.g., "en", "de", "fr")
105    pub identifier: Option<String>,
106    /// Language name in English
107    pub name: Option<String>,
108    /// Alternate language name (native script)
109    pub alternate_name: Option<String>,
110    /// Text direction ("ltr" for left-to-right, "rtl" for right-to-left)
111    pub direction: Option<String>,
112}
113
114/// Project information.
115///
116/// Simple project reference with basic metadata.
117/// For full project metadata including language, see [`ProjectInfo`].
118#[derive(Debug, Clone, Deserialize, Serialize, PartialEq)]
119pub struct Project {
120    /// Project identifier (e.g., "enwiki")
121    pub identifier: String,
122    /// Project URL
123    pub url: String,
124    /// Project type (e.g., "Wikipedia")
125    #[serde(rename = "type")]
126    pub project_type: Option<String>,
127}
128
129/// Project info for discovery API.
130///
131/// Full project metadata returned by the metadata endpoints.
132#[derive(Debug, Clone, Deserialize, Serialize, PartialEq)]
133pub struct ProjectInfo {
134    /// Project identifier (e.g., "enwiki")
135    pub identifier: String,
136    /// Project code (e.g., "wikipedia")
137    pub code: Option<String>,
138    /// Project name
139    pub name: Option<String>,
140    /// Project URL
141    pub url: Option<String>,
142    /// Project language
143    pub in_language: Option<Language>,
144}
145
146/// Project type (code) information.
147///
148/// Describes project types like "wiki" (Wikipedia) or "wiktionary".
149#[derive(Debug, Clone, Deserialize, Serialize, PartialEq)]
150pub struct ProjectType {
151    /// Type identifier (e.g., "wiki", "wiktionary")
152    pub identifier: String,
153    /// Type name
154    pub name: String,
155    /// Type description
156    pub description: Option<String>,
157}
158
159/// File size information with unit.
160///
161/// Used in snapshot and batch metadata to describe download sizes.
162#[derive(Debug, Clone, Deserialize, Serialize, PartialEq)]
163pub struct Size {
164    /// Unit of measurement (e.g., "bytes", "MB", "GB")
165    pub unit_text: String,
166    /// Size value (can be fractional for small sizes)
167    pub value: f64,
168}
169
170/// Snapshot information.
171///
172/// Metadata about available project snapshots including download chunks
173/// for parallel downloading.
174///
175/// # Example
176///
177/// ```ignore
178/// // Get snapshot info and download chunks in parallel
179/// for chunk in snapshot.chunks {
180///     let data = download_chunk(chunk.url).await?;
181///     process_chunk(data).await?;
182/// }
183/// ```
184#[derive(Debug, Clone, Deserialize, Serialize, PartialEq)]
185pub struct SnapshotInfo {
186    /// Snapshot identifier (e.g., "enwiki_namespace_0")
187    pub identifier: String,
188    /// Snapshot version
189    pub version: String,
190    /// Last modification timestamp
191    pub date_modified: DateTime<Utc>,
192    /// Language of the snapshot content
193    pub in_language: Language,
194    /// Project this snapshot belongs to
195    pub is_part_of: ProjectInfo,
196    /// Namespace of the snapshot
197    pub namespace: Namespace,
198    /// Snapshot size information
199    pub size: Size,
200    /// Downloadable chunk identifiers for parallel download
201    pub chunks: Vec<String>,
202}
203
204/// Chunk information for parallel downloads.
205///
206/// Large snapshots are split into chunks for parallel downloading.
207/// Each chunk can be downloaded independently and contains a subset
208/// of the articles.
209#[derive(Debug, Clone, Deserialize, Serialize, PartialEq)]
210pub struct ChunkInfo {
211    /// Chunk identifier (e.g., "enwiki_namespace_0_chunk_0")
212    pub identifier: String,
213    /// Chunk version
214    pub version: String,
215    /// Last modification timestamp
216    pub date_modified: DateTime<Utc>,
217    /// Language of the chunk content
218    pub in_language: Language,
219    /// Project this chunk belongs to
220    pub is_part_of: ProjectInfo,
221    /// Namespace of the chunk
222    pub namespace: Namespace,
223    /// Chunk size information
224    pub size: Size,
225    /// Download URL for the chunk
226    pub url: Option<String>,
227}
228
229/// Simplified language information (used in Realtime API).
230///
231/// Realtime API uses a minimal language representation.
232#[derive(Debug, Clone, Deserialize, Serialize, PartialEq)]
233pub struct SimplifiedLanguage {
234    /// Language code (e.g., "en", "de")
235    pub identifier: String,
236    /// Language name
237    pub name: String,
238}
239
240/// Realtime project information (used in streaming API).
241///
242/// Realtime API uses a different project structure with version
243/// and size information for the stream.
244#[derive(Debug, Clone, Deserialize, Serialize, PartialEq)]
245pub struct RealtimeProject {
246    /// Project identifier (e.g., "enwiki")
247    pub identifier: String,
248    /// Project name
249    pub name: String,
250    /// Project URL
251    pub url: String,
252    /// Project version
253    pub version: String,
254    /// Last modification timestamp
255    pub date_modified: DateTime<Utc>,
256    /// Project size
257    pub size: Size,
258    /// Language information (simplified)
259    pub in_language: SimplifiedLanguage,
260}
261
262/// Simplified namespace (used in Realtime API).
263///
264/// Realtime API uses minimal namespace representation.
265#[derive(Debug, Clone, Deserialize, Serialize, PartialEq)]
266pub struct SimplifiedNamespace {
267    /// Namespace ID
268    pub identifier: u32,
269    /// Namespace name
270    pub name: Option<String>,
271}
272
273/// Realtime batch info (hourly batches).
274///
275/// Metadata for hourly batch files available through the Realtime API.
276#[derive(Debug, Clone, Deserialize, Serialize, PartialEq)]
277pub struct RealtimeBatchInfo {
278    /// Batch identifier
279    pub identifier: String,
280    /// Batch name
281    pub name: String,
282    /// Batch version
283    pub version: String,
284    /// Language information
285    pub in_language: Language,
286    /// Project this batch belongs to
287    pub is_part_of: ProjectInfo,
288    /// Namespace information
289    pub namespace: Namespace,
290    /// Batch size information
291    pub size: Size,
292}
293
294/// Batch information for Realtime API.
295///
296/// Basic batch metadata with event count.
297#[derive(Debug, Clone, Deserialize, Serialize, PartialEq)]
298pub struct BatchInfo {
299    /// Batch identifier
300    pub identifier: String,
301    /// Batch creation timestamp
302    pub date_created: DateTime<Utc>,
303    /// Number of events in batch
304    pub event_count: u64,
305}
306
307/// Article update event for Realtime API.
308///
309/// Combines article data with event metadata for Realtime streams.
310#[derive(Debug, Clone, Deserialize, Serialize, PartialEq)]
311pub struct ArticleUpdate {
312    /// Event type
313    pub event_type: EventType,
314    /// Article data
315    pub article: crate::Article,
316    /// Event metadata
317    pub event_metadata: EventMetadata,
318}
319
#[cfg(test)]
mod tests {
    use super::*;
    use chrono::Utc;

    /// Builds the left-to-right English language fixture shared by the tests.
    fn english() -> Language {
        Language {
            identifier: Some(String::from("en")),
            name: Some(String::from("English")),
            alternate_name: None,
            direction: Some(String::from("ltr")),
        }
    }

    /// Builds the English Wikipedia project fixture shared by the tests.
    fn enwiki() -> ProjectInfo {
        ProjectInfo {
            identifier: String::from("enwiki"),
            code: Some(String::from("wikipedia")),
            name: Some(String::from("English Wikipedia")),
            url: Some(String::from("https://en.wikipedia.org")),
            in_language: Some(english()),
        }
    }

    /// Builds the main namespace (ID 0) fixture; its name is the empty string.
    fn main_namespace() -> Namespace {
        Namespace {
            identifier: 0,
            name: Some(String::new()),
            description: Some(String::from("Main namespace")),
        }
    }

    #[test]
    fn test_event_type_variants() {
        assert!(matches!(EventType::Update, EventType::Update));
        assert!(matches!(EventType::Delete, EventType::Delete));
        assert!(matches!(
            EventType::VisibilityChange,
            EventType::VisibilityChange
        ));
    }

    #[test]
    fn test_event_metadata() {
        let metadata = EventMetadata {
            identifier: String::from("evt-12345"),
            event_type: EventType::Update,
            date_created: Utc::now(),
            date_published: Some(Utc::now()),
            partition: Some(4),
            offset: Some(3593806),
        };

        assert_eq!(metadata.identifier, "evt-12345");
        assert_eq!(metadata.partition, Some(4));
        assert_eq!(metadata.offset, Some(3593806));
    }

    #[test]
    fn test_namespace_creation() {
        let namespace = main_namespace();

        assert_eq!(namespace.identifier, 0);
        assert_eq!(namespace.name.as_deref(), Some(""));
    }

    #[test]
    fn test_language_creation() {
        let language = english();

        assert_eq!(language.identifier.as_deref(), Some("en"));
        assert_eq!(language.direction.as_deref(), Some("ltr"));
    }

    #[test]
    fn test_rtl_language() {
        let arabic = Language {
            identifier: Some(String::from("ar")),
            name: Some(String::from("Arabic")),
            alternate_name: Some(String::from("العربية")),
            direction: Some(String::from("rtl")),
        };

        assert_eq!(arabic.identifier.as_deref(), Some("ar"));
        assert_eq!(arabic.direction.as_deref(), Some("rtl"));
    }

    #[test]
    fn test_project_info() {
        let info = enwiki();

        assert_eq!(info.identifier, "enwiki");
        assert_eq!(info.code.as_deref(), Some("wikipedia"));
    }

    #[test]
    fn test_size() {
        let megabytes = Size {
            unit_text: String::from("MB"),
            value: 1500.0,
        };

        assert_eq!(megabytes.value, 1500.0);
        assert_eq!(megabytes.unit_text, "MB");
    }

    #[test]
    fn test_snapshot_info() {
        let info = SnapshotInfo {
            identifier: String::from("enwiki_namespace_0"),
            version: String::from("2024-01-15"),
            date_modified: Utc::now(),
            in_language: english(),
            is_part_of: enwiki(),
            namespace: main_namespace(),
            size: Size {
                unit_text: String::from("GB"),
                value: 25.0,
            },
            chunks: Vec::new(),
        };

        assert_eq!(info.identifier, "enwiki_namespace_0");
        assert_eq!(info.chunks.len(), 0);
    }
}