Skip to main content

wme_stream/
visitor.rs

1//! Visitor trait for article processing.
2//!
3//! The visitor pattern allows processing articles incrementally without
4//! fully materializing them in memory. This is useful for:
5//!
6//! - **Graph building** - Extract nodes and edges without storing articles
7//! - **Statistics** - Count occurrences without keeping articles
8//! - **Filtering** - Selectively process specific fields
9//! - **Large datasets** - Process snapshots larger than available RAM
10//!
11//! # How It Works
12//!
13//! The visitor is called for each component of an article:
14//! 1. `visit_article_start()` - New article begins
15//! 2. `visit_category()`, `visit_link()`, etc. - Components discovered
16//! 3. `visit_article_end()` - Article complete
17//!
18//! # Implementing a Visitor
19//!
20//! ```rust
21//! use wme_stream::ArticleVisitor;
22//! use serde_json::Value;
23//!
24//! struct MyVisitor {
25//!     article_count: u64,
26//!     category_count: u64,
27//! }
28//!
29//! impl ArticleVisitor for MyVisitor {
30//!     fn visit_article_start(&mut self, _id: u64, _name: &str) {
31//!         self.article_count += 1;
32//!     }
33//!
34//!     fn visit_category(&mut self, _name: &str, _url: &str) {
35//!         self.category_count += 1;
36//!     }
37//!
38//!     fn visit_link(&mut self, _text: &str, _url: &str) {}
39//!     fn visit_infobox(&mut self, _name: &str, _value: &str) {}
40//!     fn visit_reference(&mut self, _id: &str, _ref_type: &str, _metadata: &Value) {}
41//!     fn visit_article_end(&mut self) {}
42//! }
43//! ```
44//!
45//! # Included Visitors
46//!
47//! - `NoOpVisitor` - Does nothing (useful for testing)
48//! - `StatsVisitor` - Counts articles, categories, links, etc.
49
50use serde_json::Value;
51
52/// Visitor trait for extracting data without full article materialization.
53///
54/// This trait allows incremental processing of articles, enabling
55/// graph building, statistics collection, and other use cases without
56/// keeping full articles in memory.
57///
58/// # Usage
59///
60/// Implement this trait to process specific parts of articles:
61/// - Override methods for components you care about
62/// - Leave others as no-ops
63/// - Store state in your struct fields
64///
65/// # Example
66///
67/// ```rust
68/// use wme_stream::ArticleVisitor;
69/// use serde_json::Value;
70///
71/// struct CategoryCollector {
72///     categories: Vec<String>,
73/// }
74///
75/// impl ArticleVisitor for CategoryCollector {
76///     fn visit_article_start(&mut self, _id: u64, _name: &str) {}
77///
78///     fn visit_category(&mut self, name: &str, _url: &str) {
79///         self.categories.push(name.to_string());
80///     }
81///
82///     fn visit_link(&mut self, _text: &str, _url: &str) {}
83///     fn visit_infobox(&mut self, _name: &str, _value: &str) {}
84///     fn visit_reference(&mut self, _id: &str, _ref_type: &str, _metadata: &Value) {}
85///     fn visit_article_end(&mut self) {}
86/// }
87/// ```
88pub trait ArticleVisitor {
89    /// Called when starting to process an article.
90    ///
91    /// # Arguments
92    ///
93    /// * `id` - Article identifier
94    /// * `name` - Article title
95    fn visit_article_start(&mut self, id: u64, name: &str);
96
97    /// Called when encountering a category.
98    ///
99    /// # Arguments
100    ///
101    /// * `name` - Category name (e.g., "Category:Living people")
102    /// * `url` - Category URL
103    fn visit_category(&mut self, name: &str, url: &str);
104
105    /// Called when encountering a link.
106    ///
107    /// # Arguments
108    ///
109    /// * `text` - Link text
110    /// * `url` - Link URL
111    fn visit_link(&mut self, text: &str, url: &str);
112
113    /// Called when encountering an infobox field.
114    ///
115    /// # Arguments
116    ///
117    /// * `name` - Field name
118    /// * `value` - Field value
119    fn visit_infobox(&mut self, name: &str, value: &str);
120
121    /// Called when encountering a reference.
122    ///
123    /// # Arguments
124    ///
125    /// * `id` - Reference identifier
126    /// * `ref_type` - Reference type ("web", "book", "text")
127    /// * `metadata` - Reference metadata (author, title, URL, etc.)
128    fn visit_reference(&mut self, id: &str, ref_type: &str, metadata: &Value);
129
130    /// Called when finished processing an article.
131    ///
132    /// Use this to finalize per-article processing.
133    fn visit_article_end(&mut self);
134}
135
/// A no-op visitor that does nothing.
///
/// Useful for:
/// - Testing
/// - Benchmarking
/// - Standing in where a visitor is required but no output is wanted
///
/// The type is a zero-sized unit struct, so it is trivially `Copy`,
/// comparable, hashable and constructible through `Default`.
///
/// # Example
///
/// ```rust
/// use wme_stream::{ArticleVisitor, NoOpVisitor};
///
/// let mut visitor = NoOpVisitor;
///
/// // All methods are no-ops
/// visitor.visit_article_start(1, "Example");
/// visitor.visit_link("Example", "https://en.wikipedia.org/wiki/Example");
/// visitor.visit_article_end();
///
/// assert_eq!(visitor, NoOpVisitor::default());
/// ```
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq, Hash)]
pub struct NoOpVisitor;
152
153impl ArticleVisitor for NoOpVisitor {
154    fn visit_article_start(&mut self, _id: u64, _name: &str) {}
155    fn visit_category(&mut self, _name: &str, _url: &str) {}
156    fn visit_link(&mut self, _text: &str, _url: &str) {}
157    fn visit_infobox(&mut self, _name: &str, _value: &str) {}
158    fn visit_reference(&mut self, _id: &str, _ref_type: &str, _metadata: &Value) {}
159    fn visit_article_end(&mut self) {}
160}
161
/// A visitor that collects counts only (low memory footprint).
///
/// Tracks aggregate statistics without storing individual articles.
/// Useful for getting overview statistics of a snapshot.
///
/// Each counter is a running total across all articles visited so far; the
/// counters are public so they can be read directly once processing is done.
/// Two visitors compare equal when all five counters match.
///
/// # Example
///
/// ```rust
/// use wme_stream::{ArticleVisitor, StatsVisitor};
///
/// let mut visitor = StatsVisitor::new();
///
/// // Simulate visiting 100 articles with one category and one link each
/// for id in 0..100u64 {
///     visitor.visit_article_start(id, "Article");
///     visitor.visit_category("Category:Test", "https://en.wikipedia.org/wiki/Category:Test");
///     visitor.visit_link("Link", "https://en.wikipedia.org/wiki/Link");
///     visitor.visit_article_end();
/// }
///
/// assert_eq!(visitor.article_count, 100);
/// assert_eq!(visitor.category_count, 100);
/// assert_eq!(visitor.link_count, 100);
/// ```
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct StatsVisitor {
    /// Total articles processed (one per `visit_article_start` call)
    pub article_count: u64,
    /// Total categories found
    pub category_count: u64,
    /// Total links found
    pub link_count: u64,
    /// Total infobox fields found (one per `visit_infobox` call, which is
    /// made per field rather than per infobox)
    pub infobox_count: u64,
    /// Total references found
    pub reference_count: u64,
}
199
200impl StatsVisitor {
201    /// Create a new stats visitor with all counts at zero.
202    ///
203    /// # Example
204    ///
205    /// ```rust
206    /// use wme_stream::StatsVisitor;
207    ///
208    /// let visitor = StatsVisitor::new();
209    /// assert_eq!(visitor.article_count, 0);
210    /// assert_eq!(visitor.category_count, 0);
211    /// ```
212    pub fn new() -> Self {
213        Self {
214            article_count: 0,
215            category_count: 0,
216            link_count: 0,
217            infobox_count: 0,
218            reference_count: 0,
219        }
220    }
221}
222
223impl ArticleVisitor for StatsVisitor {
224    fn visit_article_start(&mut self, _id: u64, _name: &str) {
225        self.article_count += 1;
226    }
227
228    fn visit_category(&mut self, _name: &str, _url: &str) {
229        self.category_count += 1;
230    }
231
232    fn visit_link(&mut self, _text: &str, _url: &str) {
233        self.link_count += 1;
234    }
235
236    fn visit_infobox(&mut self, _name: &str, _value: &str) {
237        self.infobox_count += 1;
238    }
239
240    fn visit_reference(&mut self, _id: &str, _ref_type: &str, _metadata: &Value) {
241        self.reference_count += 1;
242    }
243
244    fn visit_article_end(&mut self) {}
245}
246
247impl Default for StatsVisitor {
248    fn default() -> Self {
249        Self::new()
250    }
251}
252
#[cfg(test)]
mod tests {
    use super::*;

    /// Feeds one synthetic article into `visitor`: 2 categories, 3 links,
    /// 1 infobox field and 2 references.
    ///
    /// Taking `&mut dyn ArticleVisitor` also checks that the trait can be
    /// used as a trait object.
    fn feed_article(visitor: &mut dyn ArticleVisitor, id: u64) {
        visitor.visit_article_start(id, &format!("Article {}", id));

        visitor.visit_category("Category:A", "https://example.com/A");
        visitor.visit_category("Category:B", "https://example.com/B");

        visitor.visit_link("Link1", "https://example.com/1");
        visitor.visit_link("Link2", "https://example.com/2");
        visitor.visit_link("Link3", "https://example.com/3");

        visitor.visit_infobox("Name", "Value");

        visitor.visit_reference(&format!("ref{}", id * 2), "web", &Value::Null);
        visitor.visit_reference(&format!("ref{}", id * 2 + 1), "book", &Value::Null);

        visitor.visit_article_end();
    }

    /// Asserts that every counter of `visitor` is zero.
    fn assert_all_zero(visitor: &StatsVisitor) {
        assert_eq!(visitor.article_count, 0);
        assert_eq!(visitor.category_count, 0);
        assert_eq!(visitor.link_count, 0);
        assert_eq!(visitor.infobox_count, 0);
        assert_eq!(visitor.reference_count, 0);
    }

    #[test]
    fn test_noop_visitor() {
        let mut visitor = NoOpVisitor;

        // Should not panic
        visitor.visit_article_start(1, "Test");
        visitor.visit_category("Cat", "https://example.com");
        visitor.visit_link("Link", "https://example.com");
        visitor.visit_infobox("Name", "Value");
        visitor.visit_reference("ref1", "web", &Value::Null);
        visitor.visit_article_end();
    }

    #[test]
    fn test_stats_visitor_new() {
        assert_all_zero(&StatsVisitor::new());
    }

    #[test]
    fn test_stats_visitor_default() {
        // All five counters are checked, not just the first two.
        let visitor: StatsVisitor = Default::default();
        assert_all_zero(&visitor);
    }

    #[test]
    fn test_stats_visitor_counts() {
        let mut visitor = StatsVisitor::new();

        for id in 0..10 {
            feed_article(&mut visitor, id);
        }

        assert_eq!(visitor.article_count, 10);
        assert_eq!(visitor.category_count, 20); // 10 * 2
        assert_eq!(visitor.link_count, 30); // 10 * 3
        assert_eq!(visitor.infobox_count, 10); // 10 * 1
        assert_eq!(visitor.reference_count, 20); // 10 * 2
    }

    #[test]
    fn test_visitor_as_trait_object() {
        let mut noop = NoOpVisitor;
        let mut stats = StatsVisitor::new();

        // The same driver code must work for any visitor behind `dyn`.
        feed_article(&mut noop, 7);
        feed_article(&mut stats, 7);

        assert_eq!(stats.article_count, 1);
        assert_eq!(stats.category_count, 2);
        assert_eq!(stats.link_count, 3);
        assert_eq!(stats.infobox_count, 1);
        assert_eq!(stats.reference_count, 2);
    }

    // A custom visitor implementation that records what it was shown.
    struct TestVisitor {
        last_article_id: Option<u64>,
        last_article_name: Option<String>,
        categories: Vec<String>,
    }

    impl ArticleVisitor for TestVisitor {
        fn visit_article_start(&mut self, id: u64, name: &str) {
            self.last_article_id = Some(id);
            self.last_article_name = Some(name.to_string());
        }

        fn visit_category(&mut self, name: &str, _url: &str) {
            self.categories.push(name.to_string());
        }

        fn visit_link(&mut self, _text: &str, _url: &str) {}
        fn visit_infobox(&mut self, _name: &str, _value: &str) {}
        fn visit_reference(&mut self, _id: &str, _ref_type: &str, _metadata: &Value) {}
        fn visit_article_end(&mut self) {}
    }

    #[test]
    fn test_custom_visitor() {
        let mut visitor = TestVisitor {
            last_article_id: None,
            last_article_name: None,
            categories: Vec::new(),
        };

        visitor.visit_article_start(42, "Test Article");
        visitor.visit_category("Category:Test", "https://example.com");
        visitor.visit_article_end();

        assert_eq!(visitor.last_article_id, Some(42));
        assert_eq!(visitor.last_article_name, Some("Test Article".to_string()));
        assert_eq!(visitor.categories, vec!["Category:Test"]);
    }
}