Skip to main content

wme_models/
article.rs

1//! Article types for the Wikimedia Enterprise API.
2//!
3//! This module provides the core [`Article`] type used across all Enterprise APIs
4//! (On-demand, Snapshot, and Realtime). The same schema is returned by all endpoints,
5//! allowing you to process articles from any source without separate parsers.
6//!
7//! # Article Structure
8//!
9//! Articles contain comprehensive metadata about a Wikipedia page:
10//! - **Identification**: `identifier`, `name`, `url`
11//! - **Content**: `abstract_text`, `article_body` (HTML/wikitext)
12//! - **Context**: `in_language`, `is_part_of`, `namespace`
13//! - **Version**: `version` with editor info and credibility signals
14//! - **Categorization**: `categories`, `templates`, `redirects`
15//! - **Event metadata**: `event` (Realtime API only)
16//!
17//! # Handling Duplicates
18//!
19//! Snapshot and Realtime Batch files may contain duplicate articles (< 1%).
20//! When processing, keep the article with the highest `version.identifier`:
21//!
22//! ```rust
23//! use wme_models::Article;
24//!
25//! fn keep_latest(existing: &Article, incoming: &Article) -> bool {
26//!     incoming.version.identifier > existing.version.identifier
27//! }
28//! ```
29//!
30//! # Visibility Events
31//!
32//! The Realtime API may return `visibility-change` events where content is hidden.
33//! Check the `visibility` field to determine if text, editor, or comment is visible.
34
35use chrono::{DateTime, Utc};
36use serde::{Deserialize, Serialize};
37
38use crate::metadata::EventMetadata;
39use crate::structured::{Infobox, Section, Table};
40use crate::version::Version;
41use crate::{
42    Category, Image, License, Namespace, Redirect, Template, WikidataEntity, WikidataEntityUsage,
43};
44
45/// Visibility flags for articles (present on visibility-change events).
46///
47/// When the editing community flags a revision as containing potentially damaging
48/// information, they change its visibility. The three booleans indicate whether
49/// the article body, editor name, or edit comment may contain harmful data.
50///
51/// # Examples
52///
53/// ```
54/// use wme_models::Visibility;
55///
56/// let visibility = Visibility {
57///     text: true,
58///     editor: false,
59///     comment: false,
60/// };
61///
62/// // When text=false, the article content is hidden
63/// // When editor=false, the editor name is hidden
64/// // When comment=false, the edit summary is hidden
65/// ```
66#[derive(Debug, Clone, Deserialize, Serialize, PartialEq)]
67pub struct Visibility {
68    /// Is article text visible?
69    pub text: bool,
70    /// Is editor name visible?
71    pub editor: bool,
72    /// Is edit comment visible?
73    pub comment: bool,
74}
75
76/// Project reference (simplified, used in article responses).
77///
78/// This is a lightweight reference to the project an article belongs to.
79/// For full project metadata, see [`crate::metadata::ProjectInfo`].
80#[derive(Debug, Clone, Deserialize, Serialize, PartialEq)]
81pub struct ProjectRef {
82    /// Project identifier (e.g., "enwiki")
83    pub identifier: String,
84    /// Project URL
85    pub url: Option<String>,
86}
87
88/// Complete article from Enterprise API.
89///
90/// This is the primary data structure returned by all Enterprise APIs.
91/// The same schema is used across On-demand, Snapshot, and Realtime endpoints.
92///
93/// # Example
94///
95/// ```
96/// use wme_models::Article;
97/// use serde_json;
98///
99/// let json = r#"{
100///     "name": "Squirrel",
101///     "identifier": 28492,
102///     "url": "https://en.wikipedia.org/wiki/Squirrel",
103///     "date_created": "2001-01-15T00:00:00Z",
104///     "date_modified": "2024-01-15T12:00:00Z",
105///     "in_language": {"identifier": "en", "name": "English"},
106///     "is_part_of": {"identifier": "enwiki"},
107///     "namespace": {"identifier": 0, "name": ""},
108///     "license": [{"name": "CC BY-SA 4.0", "url": "https://creativecommons.org/licenses/by-sa/4.0/"}],
109///     "version": {
110///         "identifier": 1182847293,
111///         "editor": {"identifier": 12345, "name": "SomeUser"}
112///     }
113/// }"#;
114///
115/// let article: Article = serde_json::from_str(json).unwrap();
116/// assert_eq!(article.name, "Squirrel");
117/// assert_eq!(article.identifier, 28492);
118/// ```
119#[derive(Debug, Clone, Deserialize, Serialize, PartialEq)]
120pub struct Article {
121    /// Article ID (MediaWiki page ID)
122    pub identifier: u64,
123    /// Article name/title
124    pub name: String,
125    /// Article URL
126    pub url: String,
127    /// Article abstract/summary
128    #[serde(rename = "abstract")]
129    pub abstract_text: Option<String>,
130    /// Short description (e.g., "Family of rodents")
131    pub description: Option<String>,
132    /// Last modification timestamp
133    pub date_modified: DateTime<Utc>,
134    /// Before-last modification timestamp
135    pub date_previously_modified: Option<DateTime<Utc>>,
136    /// Language information
137    pub in_language: crate::metadata::Language,
138    /// Project this article belongs to (simplified reference)
139    pub is_part_of: ProjectRef,
140    /// Namespace information
141    pub namespace: Option<Namespace>,
142    /// Main Wikidata entity (primary topic)
143    pub main_entity: Option<WikidataEntity>,
144    /// Additional Wikidata entities used
145    pub additional_entities: Option<Vec<WikidataEntityUsage>>,
146    /// Categories this article belongs to
147    pub categories: Option<Vec<Category>>,
148    /// Templates used in this article
149    pub templates: Option<Vec<Template>>,
150    /// Redirects to this article (alternative names)
151    pub redirects: Option<Vec<Redirect>>,
152    /// Current version information with credibility signals
153    pub version: Version,
154    /// Previous version information
155    pub previous_version: Option<crate::version::PreviousVersion>,
156    /// Number of editors watching this page
157    pub watchers_count: Option<u64>,
158    /// Protection settings (edit/move restrictions)
159    pub protection: Option<Vec<crate::version::Protection>>,
160    /// Visibility flags (for visibility-change events)
161    pub visibility: Option<Visibility>,
162    /// Main image for the article
163    pub image: Option<Image>,
164    /// License(s) for this article (usually CC-BY-SA)
165    pub license: Vec<License>,
166    /// Article body content (HTML and wikitext)
167    pub article_body: Option<crate::content::ArticleBody>,
168    /// Event metadata (present in Realtime API responses)
169    pub event: Option<EventMetadata>,
170    /// Has parts - structured content sections (when using fields filter)
171    pub has_parts: Option<Vec<crate::structured::Section>>,
172}
173
174/// Structured Contents (BETA) - Article with parsed content.
175///
176/// This type extends [`Article`] with fully parsed content including infoboxes,
177/// sections, and tables. Available through the Structured Contents BETA endpoint.
178///
179/// # Accessing Content
180///
181/// Use the convenience methods to access parsed content:
182///
183/// ```rust,ignore
184/// use wme_models::StructuredArticle;
185///
186/// // Get infobox by name
187/// if let Some(infobox) = article.infobox("Automatic taxobox") {
188///     // Process infobox fields
189/// }
190///
191/// // Get section by name
192/// if let Some(section) = article.section("Taxonomy") {
193///     // Process section content
194/// }
195/// ```
196#[derive(Debug, Clone, Deserialize, Serialize, PartialEq)]
197pub struct StructuredArticle {
198    /// Base article fields (flattened)
199    #[serde(flatten)]
200    pub base: Article,
201    /// Creation timestamp (first revision) - only in structured contents
202    pub date_created: DateTime<Utc>,
203    /// Parsed infoboxes
204    pub infoboxes: Vec<Infobox>,
205    /// Parsed sections
206    pub sections: Vec<Section>,
207    /// Tables (single object in structured contents)
208    pub tables: Table,
209    /// References/citations (single object in structured contents)
210    pub references: crate::reference::Reference,
211}
212
213impl StructuredArticle {
214    /// Get infobox by name.
215    ///
216    /// # Examples
217    ///
218    /// ```rust,ignore
219    /// use wme_models::StructuredArticle;
220    ///
221    /// if let Some(infobox) = article.infobox("Automatic taxobox") {
222    ///     println!("Found infobox: {:?}", infobox.name);
223    /// }
224    /// ```
225    pub fn infobox(&self, name: &str) -> Option<&Infobox> {
226        self.infoboxes
227            .iter()
228            .find(|i| i.name.as_ref().map(|n| n == name).unwrap_or(false))
229    }
230
231    /// Get section by name.
232    ///
233    /// # Examples
234    ///
235    /// ```rust,ignore
236    /// use wme_models::StructuredArticle;
237    ///
238    /// if let Some(section) = article.section("References") {
239    ///     println!("Section has {} parts", section.has_parts.as_ref().map(|p| p.len()).unwrap_or(0));
240    /// }
241    /// ```
242    pub fn section(&self, name: &str) -> Option<&Section> {
243        self.sections
244            .iter()
245            .find(|s| s.name.as_ref().map(|n| n == name).unwrap_or(false))
246    }
247
248    /// Get table by identifier.
249    ///
250    /// # Examples
251    ///
252    /// ```rust,ignore
253    /// use wme_models::StructuredArticle;
254    ///
255    /// if let Some(table) = article.table("demographics_table1") {
256    ///     println!("Table has {} rows", table.rows.len());
257    /// }
258    /// ```
259    pub fn table(&self, identifier: &str) -> Option<&Table> {
260        if self.tables.identifier == identifier {
261            Some(&self.tables)
262        } else {
263            None
264        }
265    }
266}
267
268impl std::ops::Deref for StructuredArticle {
269    type Target = Article;
270
271    fn deref(&self) -> &Self::Target {
272        &self.base
273    }
274}
275
#[cfg(test)]
mod tests {
    use super::*;
    use chrono::Utc;

    /// Fixture: English, left-to-right language.
    fn english_language() -> crate::metadata::Language {
        crate::metadata::Language {
            identifier: Some("en".to_string()),
            name: Some("English".to_string()),
            alternate_name: None,
            direction: Some("ltr".to_string()),
        }
    }

    /// Fixture: the main (article) namespace, identifier 0.
    fn main_namespace() -> Namespace {
        Namespace {
            identifier: 0,
            name: Some("".to_string()),
            description: Some("Main namespace".to_string()),
        }
    }

    /// Fixture: reference to the English Wikipedia project.
    fn enwiki_project() -> ProjectRef {
        ProjectRef {
            identifier: "enwiki".to_string(),
            url: Some("https://en.wikipedia.org".to_string()),
        }
    }

    /// Fixture: a fully populated revision made by a regular, non-bot user.
    fn sample_version() -> Version {
        let editor = crate::version::Editor {
            identifier: Some(12345),
            name: Some("TestUser".to_string()),
            is_bot: Some(false),
            is_anonymous: Some(false),
            date_started: Some(Utc::now()),
            edit_count: Some(1000),
            groups: Some(vec!["user".to_string()]),
            is_admin: Some(false),
            is_patroller: Some(false),
            has_advanced_rights: Some(false),
        };
        let size = crate::version::ArticleSize {
            value: 15000,
            unit_text: "B".to_string(),
        };

        Version {
            identifier: 1182847293,
            editor: Some(editor),
            comment: Some("Test edit".to_string()),
            tags: Some(vec!["mobile edit".to_string()]),
            has_tag_needs_citation: Some(false),
            is_minor_edit: Some(false),
            is_flagged_stable: Some(true),
            is_breaking_news: Some(false),
            noindex: Some(false),
            number_of_characters: Some(5000),
            size: Some(size),
            maintenance_tags: None,
            scores: None,
        }
    }

    /// Fixture: the "Squirrel" article at revision 1182847293.
    fn sample_article() -> Article {
        Article {
            identifier: 28492,
            name: "Squirrel".to_string(),
            url: "https://en.wikipedia.org/wiki/Squirrel".to_string(),
            abstract_text: Some("Squirrels are members of the family Sciuridae...".to_string()),
            description: Some("Family of rodents".to_string()),
            date_modified: Utc::now(),
            date_previously_modified: None,
            in_language: english_language(),
            is_part_of: enwiki_project(),
            namespace: Some(main_namespace()),
            main_entity: None,
            additional_entities: None,
            categories: None,
            templates: None,
            redirects: None,
            version: sample_version(),
            previous_version: None,
            watchers_count: Some(42),
            protection: None,
            visibility: None,
            image: None,
            license: vec![],
            article_body: None,
            event: None,
            has_parts: None,
        }
    }

    #[test]
    fn test_article_creation() {
        let article = sample_article();
        assert_eq!(article.identifier, 28492);
        assert_eq!(article.name, "Squirrel");
        assert!(article.abstract_text.is_some());
    }

    #[test]
    fn test_project_ref_creation() {
        let project = enwiki_project();
        assert_eq!(project.identifier, "enwiki");
        assert!(project.url.is_some());
    }

    #[test]
    fn test_visibility_creation() {
        let visibility = Visibility {
            text: true,
            editor: false,
            comment: false,
        };
        assert!(visibility.text);
        assert!(!visibility.editor);
        assert!(!visibility.comment);
    }

    #[test]
    fn test_version_comparison_for_dedup() {
        let older = sample_article();

        // Same page one revision later: only the fields that differ from the
        // fixture are spelled out, the rest is taken from `sample_article()`.
        let newer = Article {
            abstract_text: None,
            description: None,
            watchers_count: None,
            version: Version {
                identifier: 1182847294, // Higher version
                comment: None,
                tags: None,
                has_tag_needs_citation: None,
                is_minor_edit: None,
                is_flagged_stable: None,
                is_breaking_news: None,
                noindex: None,
                number_of_characters: None,
                size: None,
                maintenance_tags: None,
                scores: None,
                // The editor is carried over from the fixture revision.
                ..sample_version()
            },
            ..sample_article()
        };

        // When deduplicating, keep the one with higher version.identifier
        assert!(newer.version.identifier > older.version.identifier);
    }
}