wme_models/metadata.rs
1//! Metadata types for the Wikimedia Enterprise API.
2//!
3//! This module provides types for reference data, snapshots, and event metadata
4//! used across all Enterprise APIs. These types describe the context around
5//! articles rather than article content itself.
6//!
7//! # Namespaces
8//!
9//! Wikimedia projects use namespaces to organize different types of pages:
10//! - **0 (Main)**: Article content
11//! - **6 (File)**: Media files
12//! - **10 (Template)**: Reusable wikitext
13//! - **14 (Category)**: Topic groupings
14//!
15//! See [`Namespace`] for namespace information.
16//!
17//! # Snapshots
18//!
19//! Snapshots provide complete dumps of Wikimedia projects. Use [`SnapshotInfo`]
20//! to discover available snapshots and their download chunks.
21//!
22//! # Events
23//!
24//! The Realtime API uses event-based updates. See [`EventMetadata`] for
25//! partition/offset tracking to enable resume functionality.
26
27use crate::Article;
28use chrono::{DateTime, Utc};
29use serde::{Deserialize, Serialize};
30
31/// Event types for article changes.
32///
33/// The Realtime API streams events of these types as they happen.
34/// Event types indicate what kind of change occurred to an article.
35#[derive(Debug, Clone, Deserialize, Serialize, PartialEq)]
36pub enum EventType {
37 /// Article was created or updated (content changed)
38 #[serde(rename = "update")]
39 Update,
40 /// Article was deleted
41 #[serde(rename = "delete")]
42 Delete,
43 /// Article visibility changed (content/editor/comment hidden)
44 #[serde(rename = "visibility-change")]
45 VisibilityChange,
46}
47
48/// Event metadata for articles.
49///
50/// Tracks events through the Wikimedia Enterprise system. The partition and
51/// offset fields enable resuming streams after disconnections.
52///
53/// # Resuming Streams
54///
55/// ```ignore
56/// // Store the last processed offset for each partition
57/// let offsets: HashMap<u32, u64> = load_checkpoints();
58///
59/// // When reconnecting, pass these offsets
60/// client.connect_with_offsets(offsets).await?;
61/// ```
62#[derive(Debug, Clone, Deserialize, Serialize, PartialEq)]
63pub struct EventMetadata {
64 /// Event UUID (for tracking through the system)
65 pub identifier: String,
66 /// Event type (update, delete, visibility-change)
67 #[serde(rename = "type")]
68 pub event_type: EventType,
69 /// Event creation timestamp (when entered Enterprise system)
70 pub date_created: DateTime<Utc>,
71 /// Event publication timestamp (Realtime API only)
72 pub date_published: Option<DateTime<Utc>>,
73 /// Partition number (Realtime API only, for resume)
74 pub partition: Option<u32>,
75 /// Offset within partition (Realtime API only, for resume)
76 pub offset: Option<u64>,
77}
78
79/// Namespace information.
80///
81/// Namespaces organize different types of pages in Wikimedia projects.
82/// The identifier is the namespace number (0, 6, 10, 14, etc.).
83///
84/// # Common Namespaces
85///
86/// - `0` - Main (article content)
87/// - `6` - File (media uploads)
88/// - `10` - Template (reusable wikitext)
89/// - `14` - Category (topic groupings)
90#[derive(Debug, Clone, Deserialize, Serialize, PartialEq)]
91pub struct Namespace {
92 /// Namespace ID (0 = Main, 6 = File, 10 = Template, 14 = Category)
93 pub identifier: u32,
94 /// Namespace name
95 pub name: Option<String>,
96 /// Namespace description
97 pub description: Option<String>,
98}
99
100/// Language information.
101///
102/// Describes a language supported by Wikimedia projects.
103#[derive(Debug, Clone, Deserialize, Serialize, PartialEq)]
104pub struct Language {
105 /// Language code (e.g., "en", "de", "fr")
106 pub identifier: Option<String>,
107 /// Language name in English
108 pub name: Option<String>,
109 /// Alternate language name (native script)
110 pub alternate_name: Option<String>,
111 /// Text direction ("ltr" for left-to-right, "rtl" for right-to-left)
112 pub direction: Option<String>,
113}
114
115/// Project information.
116///
117/// Simple project reference with basic metadata.
118/// For full project metadata including language, see [`ProjectInfo`].
119#[derive(Debug, Clone, Deserialize, Serialize, PartialEq)]
120pub struct Project {
121 /// Project identifier (e.g., "enwiki")
122 pub identifier: String,
123 /// Project URL
124 pub url: String,
125 /// Project type (e.g., "Wikipedia")
126 #[serde(rename = "type")]
127 pub project_type: Option<String>,
128}
129
130/// Project info for discovery API.
131///
132/// Full project metadata returned by the metadata endpoints.
133#[derive(Debug, Clone, Deserialize, Serialize, PartialEq)]
134pub struct ProjectInfo {
135 /// Project identifier (e.g., "enwiki")
136 pub identifier: String,
137 /// Project code (e.g., "wikipedia")
138 pub code: Option<String>,
139 /// Project name
140 pub name: Option<String>,
141 /// Project URL
142 pub url: Option<String>,
143 /// Project language
144 pub in_language: Option<Language>,
145}
146
147/// Project type (code) information.
148///
149/// Describes project types like "wiki" (Wikipedia) or "wiktionary".
150#[derive(Debug, Clone, Deserialize, Serialize, PartialEq)]
151pub struct ProjectType {
152 /// Type identifier (e.g., "wiki", "wiktionary")
153 pub identifier: String,
154 /// Type name
155 pub name: String,
156 /// Type description
157 pub description: Option<String>,
158}
159
160/// File size information with unit.
161///
162/// Used in snapshot and batch metadata to describe download sizes.
163#[derive(Debug, Clone, Deserialize, Serialize, PartialEq)]
164pub struct Size {
165 /// Unit of measurement (e.g., "bytes", "MB", "GB")
166 pub unit_text: String,
167 /// Size value (can be fractional for small sizes)
168 pub value: f64,
169}
170
171/// Snapshot information.
172///
173/// Metadata about available project snapshots including download chunks
174/// for parallel downloading.
175///
176/// # Example
177///
178/// ```ignore
179/// // Get snapshot info and download chunks in parallel
180/// for chunk in snapshot.chunks {
181/// let data = download_chunk(chunk.url).await?;
182/// process_chunk(data).await?;
183/// }
184/// ```
185#[derive(Debug, Clone, Deserialize, Serialize, PartialEq)]
186pub struct SnapshotInfo {
187 /// Snapshot identifier (e.g., "enwiki_namespace_0")
188 pub identifier: String,
189 /// Snapshot version
190 pub version: String,
191 /// Last modification timestamp
192 pub date_modified: DateTime<Utc>,
193 /// Language of the snapshot content
194 pub in_language: Language,
195 /// Project this snapshot belongs to
196 pub is_part_of: ProjectInfo,
197 /// Namespace of the snapshot
198 pub namespace: Namespace,
199 /// Snapshot size information
200 pub size: Size,
201 /// Downloadable chunk identifiers for parallel download
202 pub chunks: Option<Vec<String>>,
203}
204
205/// Chunk information for parallel downloads.
206///
207/// Large snapshots are split into chunks for parallel downloading.
208/// Each chunk can be downloaded independently and contains a subset
209/// of the articles.
210#[derive(Debug, Clone, Deserialize, Serialize, PartialEq)]
211pub struct ChunkInfo {
212 /// Chunk identifier (e.g., "enwiki_namespace_0_chunk_0")
213 pub identifier: String,
214 /// Chunk version
215 pub version: String,
216 /// Last modification timestamp
217 pub date_modified: DateTime<Utc>,
218 /// Language of the chunk content
219 pub in_language: Language,
220 /// Project this chunk belongs to
221 pub is_part_of: ProjectInfo,
222 /// Namespace of the chunk
223 pub namespace: Namespace,
224 /// Chunk size information
225 pub size: Size,
226 /// Download URL for the chunk
227 pub url: Option<String>,
228}
229
230/// Simplified language information (used in Realtime API).
231///
232/// Realtime API uses a minimal language representation.
233#[derive(Debug, Clone, Deserialize, Serialize, PartialEq)]
234pub struct SimplifiedLanguage {
235 /// Language code (e.g., "en", "de")
236 pub identifier: String,
237 /// Language name
238 pub name: String,
239}
240
241/// Realtime project information (used in streaming API).
242///
243/// Realtime API uses a different project structure with version
244/// and size information for the stream.
245#[derive(Debug, Clone, Deserialize, Serialize, PartialEq)]
246pub struct RealtimeProject {
247 /// Project identifier (e.g., "enwiki")
248 pub identifier: String,
249 /// Project name
250 pub name: String,
251 /// Project URL
252 pub url: String,
253 /// Project version
254 pub version: String,
255 /// Last modification timestamp
256 pub date_modified: DateTime<Utc>,
257 /// Project size
258 pub size: Size,
259 /// Language information (simplified)
260 pub in_language: SimplifiedLanguage,
261}
262
263/// Simplified namespace (used in Realtime API).
264///
265/// Realtime API uses minimal namespace representation.
266#[derive(Debug, Clone, Deserialize, Serialize, PartialEq)]
267pub struct SimplifiedNamespace {
268 /// Namespace ID
269 pub identifier: u32,
270 /// Namespace name
271 pub name: Option<String>,
272}
273
274/// Realtime batch info (hourly batches).
275///
276/// Metadata for hourly batch files available through the Realtime API.
277#[derive(Debug, Clone, Deserialize, Serialize, PartialEq)]
278pub struct RealtimeBatchInfo {
279 /// Batch identifier
280 pub identifier: String,
281 /// Batch name
282 pub name: String,
283 /// Batch version
284 pub version: String,
285 /// Language information
286 pub in_language: Language,
287 /// Project this batch belongs to
288 pub is_part_of: ProjectInfo,
289 /// Namespace information
290 pub namespace: Namespace,
291 /// Batch size information
292 pub size: Size,
293}
294
295/// Batch information for Realtime API.
296///
297/// Basic batch metadata with event count.
298#[derive(Debug, Clone, Deserialize, Serialize, PartialEq)]
299pub struct BatchInfo {
300 /// Batch identifier
301 pub identifier: String,
302 /// Batch creation timestamp
303 pub date_created: DateTime<Utc>,
304 /// Number of events in batch
305 pub event_count: u64,
306}
307
308/// Article update event for Realtime API.
309///
310/// Combines article data with event metadata for Realtime streams.
311#[derive(Debug, Clone, Deserialize, Serialize, PartialEq)]
312pub struct ArticleUpdate {
313 /// Event type
314 pub event_type: EventType,
315 /// Article data
316 pub article: Article,
317 /// Event metadata
318 pub event_metadata: EventMetadata,
319}
320
#[cfg(test)]
mod tests {
    use super::*;
    use chrono::Utc;

    /// English language fixture shared by several tests.
    fn english() -> Language {
        Language {
            identifier: Some(String::from("en")),
            name: Some(String::from("English")),
            alternate_name: None,
            direction: Some(String::from("ltr")),
        }
    }

    /// English Wikipedia project fixture.
    fn english_wikipedia() -> ProjectInfo {
        ProjectInfo {
            identifier: String::from("enwiki"),
            code: Some(String::from("wikipedia")),
            name: Some(String::from("English Wikipedia")),
            url: Some(String::from("https://en.wikipedia.org")),
            in_language: Some(english()),
        }
    }

    /// Main namespace fixture; its name is the empty string.
    fn main_namespace() -> Namespace {
        Namespace {
            identifier: 0,
            name: Some(String::new()),
            description: Some(String::from("Main namespace")),
        }
    }

    // Each variant can be constructed and is recognised by pattern matching.
    #[test]
    fn test_event_type_variants() {
        assert!(matches!(EventType::Update, EventType::Update));
        assert!(matches!(EventType::Delete, EventType::Delete));
        assert!(matches!(
            EventType::VisibilityChange,
            EventType::VisibilityChange
        ));
    }

    // Fields set on construction are read back unchanged.
    #[test]
    fn test_event_metadata() {
        let now = Utc::now();
        let metadata = EventMetadata {
            identifier: String::from("evt-12345"),
            event_type: EventType::Update,
            date_created: now,
            date_published: Some(now),
            partition: Some(4),
            offset: Some(3_593_806),
        };

        assert_eq!(metadata.identifier, "evt-12345");
        assert_eq!(metadata.partition, Some(4));
        assert_eq!(metadata.offset, Some(3_593_806));
    }

    #[test]
    fn test_namespace_creation() {
        let namespace = main_namespace();

        assert_eq!(namespace.identifier, 0);
        assert_eq!(namespace.name.as_deref(), Some(""));
    }

    #[test]
    fn test_language_creation() {
        let language = english();

        assert_eq!(language.identifier.as_deref(), Some("en"));
        assert_eq!(language.direction.as_deref(), Some("ltr"));
    }

    #[test]
    fn test_rtl_language() {
        let language = Language {
            identifier: Some(String::from("ar")),
            name: Some(String::from("Arabic")),
            alternate_name: Some(String::from("العربية")),
            direction: Some(String::from("rtl")),
        };

        assert_eq!(language.identifier.as_deref(), Some("ar"));
        assert_eq!(language.direction.as_deref(), Some("rtl"));
    }

    #[test]
    fn test_project_info() {
        let info = english_wikipedia();

        assert_eq!(info.identifier, "enwiki");
        assert_eq!(info.code.as_deref(), Some("wikipedia"));
    }

    #[test]
    fn test_size() {
        let megabytes = Size {
            unit_text: String::from("MB"),
            value: 1500.0,
        };

        assert_eq!(megabytes.value, 1500.0);
        assert_eq!(megabytes.unit_text, "MB");
    }

    #[test]
    fn test_snapshot_info() {
        let info = SnapshotInfo {
            identifier: String::from("enwiki_namespace_0"),
            version: String::from("2024-01-15"),
            date_modified: Utc::now(),
            in_language: english(),
            is_part_of: english_wikipedia(),
            namespace: main_namespace(),
            size: Size {
                unit_text: String::from("GB"),
                value: 25.0,
            },
            chunks: Some(Vec::new()),
        };

        assert_eq!(info.identifier, "enwiki_namespace_0");
        // A missing chunk list counts as zero chunks, same as an empty one.
        let chunk_count = info.chunks.as_ref().map_or(0, Vec::len);
        assert_eq!(chunk_count, 0);
    }
}
460}