wme_models/article.rs
1//! Article types for the Wikimedia Enterprise API.
2//!
3//! This module provides the core [`Article`] type used across all Enterprise APIs
4//! (On-demand, Snapshot, and Realtime). The same schema is returned by all endpoints,
5//! allowing you to process articles from any source without separate parsers.
6//!
7//! # Article Structure
8//!
9//! Articles contain comprehensive metadata about a Wikipedia page:
10//! - **Identification**: `identifier`, `name`, `url`
11//! - **Content**: `abstract_text`, `article_body` (HTML/wikitext)
12//! - **Context**: `in_language`, `is_part_of`, `namespace`
13//! - **Version**: `version` with editor info and credibility signals
14//! - **Categorization**: `categories`, `templates`, `redirects`
15//! - **Event metadata**: `event` (Realtime API only)
16//!
17//! # Handling Duplicates
18//!
19//! Snapshot and Realtime Batch files may contain duplicate articles (< 1%).
20//! When processing, keep the article with the highest `version.identifier`:
21//!
//! ```rust
//! use wme_models::Article;
//!
//! fn keep_latest(existing: &Article, incoming: &Article) -> bool {
//!     incoming.version.identifier > existing.version.identifier
//! }
//! ```
29//!
//! # Visibility Events
//!
//! The Realtime API may return `visibility-change` events where content is hidden.
//! Check the `visibility` field: each of its `text`, `editor`, and `comment` flags is
//! `true` while that part is visible and `false` once it has been hidden.
34
35use chrono::{DateTime, Utc};
36use serde::{Deserialize, Serialize};
37
38use crate::metadata::EventMetadata;
39use crate::structured::{Infobox, Section, Table};
40use crate::version::Version;
41use crate::{
42 Category, Image, License, Namespace, Redirect, Template, WikidataEntity, WikidataEntityUsage,
43};
44
45/// Visibility flags for articles (present on visibility-change events).
46///
47/// When the editing community flags a revision as containing potentially damaging
48/// information, they change its visibility. The three booleans indicate whether
49/// the article body, editor name, or edit comment may contain harmful data.
50///
51/// # Examples
52///
53/// ```
54/// use wme_models::Visibility;
55///
56/// let visibility = Visibility {
57/// text: true,
58/// editor: false,
59/// comment: false,
60/// };
61///
62/// // When text=false, the article content is hidden
63/// // When editor=false, the editor name is hidden
64/// // When comment=false, the edit summary is hidden
65/// ```
66#[derive(Debug, Clone, Deserialize, Serialize, PartialEq)]
67pub struct Visibility {
68 /// Is article text visible?
69 pub text: bool,
70 /// Is editor name visible?
71 pub editor: bool,
72 /// Is edit comment visible?
73 pub comment: bool,
74}
75
76/// Project reference (simplified, used in article responses).
77///
78/// This is a lightweight reference to the project an article belongs to.
79/// For full project metadata, see [`crate::metadata::ProjectInfo`].
80#[derive(Debug, Clone, Deserialize, Serialize, PartialEq)]
81pub struct ProjectRef {
82 /// Project identifier (e.g., "enwiki")
83 pub identifier: String,
84 /// Project URL
85 pub url: Option<String>,
86}
87
88/// Complete article from Enterprise API.
89///
90/// This is the primary data structure returned by all Enterprise APIs.
91/// The same schema is used across On-demand, Snapshot, and Realtime endpoints.
92///
93/// # Example
94///
95/// ```
96/// use wme_models::Article;
97/// use serde_json;
98///
99/// let json = r#"{
100/// "name": "Squirrel",
101/// "identifier": 28492,
102/// "url": "https://en.wikipedia.org/wiki/Squirrel",
103/// "date_created": "2001-01-15T00:00:00Z",
104/// "date_modified": "2024-01-15T12:00:00Z",
105/// "in_language": {"identifier": "en", "name": "English"},
106/// "is_part_of": {"identifier": "enwiki"},
107/// "namespace": {"identifier": 0, "name": ""},
108/// "license": [{"name": "CC BY-SA 4.0", "url": "https://creativecommons.org/licenses/by-sa/4.0/"}],
109/// "version": {
110/// "identifier": 1182847293,
111/// "editor": {"identifier": 12345, "name": "SomeUser"}
112/// }
113/// }"#;
114///
115/// let article: Article = serde_json::from_str(json).unwrap();
116/// assert_eq!(article.name, "Squirrel");
117/// assert_eq!(article.identifier, 28492);
118/// ```
119#[derive(Debug, Clone, Deserialize, Serialize, PartialEq)]
120pub struct Article {
121 /// Article ID (MediaWiki page ID)
122 pub identifier: u64,
123 /// Article name/title
124 pub name: String,
125 /// Article URL
126 pub url: String,
127 /// Article abstract/summary
128 #[serde(rename = "abstract")]
129 pub abstract_text: Option<String>,
130 /// Short description (e.g., "Family of rodents")
131 pub description: Option<String>,
132 /// Last modification timestamp
133 pub date_modified: DateTime<Utc>,
134 /// Before-last modification timestamp
135 pub date_previously_modified: Option<DateTime<Utc>>,
136 /// Language information
137 pub in_language: crate::metadata::Language,
138 /// Project this article belongs to (simplified reference)
139 pub is_part_of: ProjectRef,
140 /// Namespace information
141 pub namespace: Option<Namespace>,
142 /// Main Wikidata entity (primary topic)
143 pub main_entity: Option<WikidataEntity>,
144 /// Additional Wikidata entities used
145 pub additional_entities: Option<Vec<WikidataEntityUsage>>,
146 /// Categories this article belongs to
147 pub categories: Option<Vec<Category>>,
148 /// Templates used in this article
149 pub templates: Option<Vec<Template>>,
150 /// Redirects to this article (alternative names)
151 pub redirects: Option<Vec<Redirect>>,
152 /// Current version information with credibility signals
153 pub version: Version,
154 /// Previous version information
155 pub previous_version: Option<crate::version::PreviousVersion>,
156 /// Number of editors watching this page
157 pub watchers_count: Option<u64>,
158 /// Protection settings (edit/move restrictions)
159 pub protection: Option<Vec<crate::version::Protection>>,
160 /// Visibility flags (for visibility-change events)
161 pub visibility: Option<Visibility>,
162 /// Main image for the article
163 pub image: Option<Image>,
164 /// License(s) for this article (usually CC-BY-SA)
165 pub license: Vec<License>,
166 /// Article body content (HTML and wikitext)
167 pub article_body: Option<crate::content::ArticleBody>,
168 /// Event metadata (present in Realtime API responses)
169 pub event: Option<EventMetadata>,
170 /// Has parts - structured content sections (when using fields filter)
171 pub has_parts: Option<Vec<crate::structured::Section>>,
172}
173
174/// Structured Contents (BETA) - Article with parsed content.
175///
176/// This type extends [`Article`] with fully parsed content including infoboxes,
177/// sections, and tables. Available through the Structured Contents BETA endpoint.
178///
179/// # Accessing Content
180///
181/// Use the convenience methods to access parsed content:
182///
183/// ```rust,ignore
184/// use wme_models::StructuredArticle;
185///
186/// // Get infobox by name
187/// if let Some(infobox) = article.infobox("Automatic taxobox") {
188/// // Process infobox fields
189/// }
190///
191/// // Get section by name
192/// if let Some(section) = article.section("Taxonomy") {
193/// // Process section content
194/// }
195/// ```
196#[derive(Debug, Clone, Deserialize, Serialize, PartialEq)]
197pub struct StructuredArticle {
198 /// Base article fields (flattened)
199 #[serde(flatten)]
200 pub base: Article,
201 /// Creation timestamp (first revision) - only in structured contents
202 pub date_created: DateTime<Utc>,
203 /// Parsed infoboxes
204 pub infoboxes: Vec<Infobox>,
205 /// Parsed sections
206 pub sections: Vec<Section>,
207 /// Tables (single object in structured contents)
208 pub tables: Table,
209 /// References/citations (single object in structured contents)
210 pub references: crate::reference::Reference,
211}
212
213impl StructuredArticle {
214 /// Get infobox by name.
215 ///
216 /// # Examples
217 ///
218 /// ```rust,ignore
219 /// use wme_models::StructuredArticle;
220 ///
221 /// if let Some(infobox) = article.infobox("Automatic taxobox") {
222 /// println!("Found infobox: {:?}", infobox.name);
223 /// }
224 /// ```
225 pub fn infobox(&self, name: &str) -> Option<&Infobox> {
226 self.infoboxes
227 .iter()
228 .find(|i| i.name.as_ref().map(|n| n == name).unwrap_or(false))
229 }
230
231 /// Get section by name.
232 ///
233 /// # Examples
234 ///
235 /// ```rust,ignore
236 /// use wme_models::StructuredArticle;
237 ///
238 /// if let Some(section) = article.section("References") {
239 /// println!("Section has {} parts", section.has_parts.as_ref().map(|p| p.len()).unwrap_or(0));
240 /// }
241 /// ```
242 pub fn section(&self, name: &str) -> Option<&Section> {
243 self.sections
244 .iter()
245 .find(|s| s.name.as_ref().map(|n| n == name).unwrap_or(false))
246 }
247
248 /// Get table by identifier.
249 ///
250 /// # Examples
251 ///
252 /// ```rust,ignore
253 /// use wme_models::StructuredArticle;
254 ///
255 /// if let Some(table) = article.table("demographics_table1") {
256 /// println!("Table has {} rows", table.rows.len());
257 /// }
258 /// ```
259 pub fn table(&self, identifier: &str) -> Option<&Table> {
260 if self.tables.identifier == identifier {
261 Some(&self.tables)
262 } else {
263 None
264 }
265 }
266}
267
268impl std::ops::Deref for StructuredArticle {
269 type Target = Article;
270
271 fn deref(&self) -> &Self::Target {
272 &self.base
273 }
274}
275
#[cfg(test)]
mod tests {
    use super::*;
    use crate::version::{ArticleSize, Editor};
    use chrono::Utc;

    /// Fixture: English, left-to-right.
    fn english() -> crate::metadata::Language {
        crate::metadata::Language {
            identifier: Some(String::from("en")),
            name: Some(String::from("English")),
            alternate_name: None,
            direction: Some(String::from("ltr")),
        }
    }

    /// Fixture: namespace 0 with an empty name.
    fn main_namespace() -> Namespace {
        Namespace {
            identifier: 0,
            name: Some(String::new()),
            description: Some(String::from("Main namespace")),
        }
    }

    /// Fixture: reference to the `enwiki` project.
    fn enwiki() -> ProjectRef {
        ProjectRef {
            identifier: String::from("enwiki"),
            url: Some(String::from("https://en.wikipedia.org")),
        }
    }

    /// Fixture: fully populated version with identifier 1182847293.
    fn sample_version() -> Version {
        let editor = Editor {
            identifier: Some(12345),
            name: Some(String::from("TestUser")),
            is_bot: Some(false),
            is_anonymous: Some(false),
            date_started: Some(Utc::now()),
            edit_count: Some(1000),
            groups: Some(vec![String::from("user")]),
            is_admin: Some(false),
            is_patroller: Some(false),
            has_advanced_rights: Some(false),
        };
        let size = ArticleSize {
            value: 15000,
            unit_text: String::from("B"),
        };

        Version {
            identifier: 1182847293,
            editor: Some(editor),
            comment: Some(String::from("Test edit")),
            tags: Some(vec![String::from("mobile edit")]),
            has_tag_needs_citation: Some(false),
            is_minor_edit: Some(false),
            is_flagged_stable: Some(true),
            is_breaking_news: Some(false),
            noindex: Some(false),
            number_of_characters: Some(5000),
            size: Some(size),
            maintenance_tags: None,
            scores: None,
        }
    }

    /// Fixture: the "Squirrel" article at `sample_version()`.
    fn sample_article() -> Article {
        Article {
            identifier: 28492,
            name: String::from("Squirrel"),
            url: String::from("https://en.wikipedia.org/wiki/Squirrel"),
            abstract_text: Some(String::from(
                "Squirrels are members of the family Sciuridae...",
            )),
            description: Some(String::from("Family of rodents")),
            date_modified: Utc::now(),
            date_previously_modified: None,
            in_language: english(),
            is_part_of: enwiki(),
            namespace: Some(main_namespace()),
            main_entity: None,
            additional_entities: None,
            categories: None,
            templates: None,
            redirects: None,
            version: sample_version(),
            previous_version: None,
            watchers_count: Some(42),
            protection: None,
            visibility: None,
            image: None,
            license: Vec::new(),
            article_body: None,
            event: None,
            has_parts: None,
        }
    }

    #[test]
    fn test_article_creation() {
        let article = sample_article();
        assert_eq!(article.identifier, 28492);
        assert_eq!(article.name, "Squirrel");
        assert!(article.abstract_text.is_some());
    }

    #[test]
    fn test_project_ref_creation() {
        let project = enwiki();
        assert_eq!(project.identifier, "enwiki");
        assert!(project.url.is_some());
    }

    #[test]
    fn test_visibility_creation() {
        let Visibility {
            text,
            editor,
            comment,
        } = Visibility {
            text: true,
            editor: false,
            comment: false,
        };
        assert!(text);
        assert!(!editor);
        assert!(!comment);
    }

    #[test]
    fn test_version_comparison_for_dedup() {
        let older = sample_article();
        // Same page, but a sparser payload carrying a newer revision.
        let newer = Article {
            abstract_text: None,
            description: None,
            watchers_count: None,
            version: Version {
                identifier: 1182847294, // Higher version
                comment: None,
                tags: None,
                has_tag_needs_citation: None,
                is_minor_edit: None,
                is_flagged_stable: None,
                is_breaking_news: None,
                noindex: None,
                number_of_characters: None,
                size: None,
                ..sample_version()
            },
            ..sample_article()
        };

        // When deduplicating, keep the one with higher version.identifier
        assert!(newer.version.identifier > older.version.identifier);
    }
}
445}