wme_stream/visitor.rs
1//! Visitor trait for article processing.
2//!
3//! The visitor pattern allows processing articles incrementally without
4//! fully materializing them in memory. This is useful for:
5//!
6//! - **Graph building** - Extract nodes and edges without storing articles
7//! - **Statistics** - Count occurrences without keeping articles
8//! - **Filtering** - Selectively process specific fields
9//! - **Large datasets** - Process snapshots larger than available RAM
10//!
11//! # How It Works
12//!
13//! The visitor is called for each component of an article:
14//! 1. `visit_article_start()` - New article begins
15//! 2. `visit_category()`, `visit_link()`, etc. - Components discovered
16//! 3. `visit_article_end()` - Article complete
17//!
18//! # Implementing a Visitor
19//!
20//! ```rust
21//! use wme_stream::ArticleVisitor;
22//! use serde_json::Value;
23//!
24//! struct MyVisitor {
25//! article_count: u64,
26//! category_count: u64,
27//! }
28//!
29//! impl ArticleVisitor for MyVisitor {
30//! fn visit_article_start(&mut self, _id: u64, _name: &str) {
31//! self.article_count += 1;
32//! }
33//!
34//! fn visit_category(&mut self, _name: &str, _url: &str) {
35//! self.category_count += 1;
36//! }
37//!
38//! fn visit_link(&mut self, _text: &str, _url: &str) {}
39//! fn visit_infobox(&mut self, _name: &str, _value: &str) {}
40//! fn visit_reference(&mut self, _id: &str, _ref_type: &str, _metadata: &Value) {}
41//! fn visit_article_end(&mut self) {}
42//! }
43//! ```
44//!
45//! # Included Visitors
46//!
47//! - `NoOpVisitor` - Does nothing (useful for testing)
48//! - `StatsVisitor` - Counts articles, categories, links, etc.
49
50use serde_json::Value;
51
52/// Visitor trait for extracting data without full article materialization.
53///
54/// This trait allows incremental processing of articles, enabling
55/// graph building, statistics collection, and other use cases without
56/// keeping full articles in memory.
57///
58/// # Usage
59///
60/// Implement this trait to process specific parts of articles:
61/// - Override methods for components you care about
62/// - Leave others as no-ops
63/// - Store state in your struct fields
64///
65/// # Example
66///
67/// ```rust
68/// use wme_stream::ArticleVisitor;
69/// use serde_json::Value;
70///
71/// struct CategoryCollector {
72/// categories: Vec<String>,
73/// }
74///
75/// impl ArticleVisitor for CategoryCollector {
76/// fn visit_article_start(&mut self, _id: u64, _name: &str) {}
77///
78/// fn visit_category(&mut self, name: &str, _url: &str) {
79/// self.categories.push(name.to_string());
80/// }
81///
82/// fn visit_link(&mut self, _text: &str, _url: &str) {}
83/// fn visit_infobox(&mut self, _name: &str, _value: &str) {}
84/// fn visit_reference(&mut self, _id: &str, _ref_type: &str, _metadata: &Value) {}
85/// fn visit_article_end(&mut self) {}
86/// }
87/// ```
88pub trait ArticleVisitor {
89 /// Called when starting to process an article.
90 ///
91 /// # Arguments
92 ///
93 /// * `id` - Article identifier
94 /// * `name` - Article title
95 fn visit_article_start(&mut self, id: u64, name: &str);
96
97 /// Called when encountering a category.
98 ///
99 /// # Arguments
100 ///
101 /// * `name` - Category name (e.g., "Category:Living people")
102 /// * `url` - Category URL
103 fn visit_category(&mut self, name: &str, url: &str);
104
105 /// Called when encountering a link.
106 ///
107 /// # Arguments
108 ///
109 /// * `text` - Link text
110 /// * `url` - Link URL
111 fn visit_link(&mut self, text: &str, url: &str);
112
113 /// Called when encountering an infobox field.
114 ///
115 /// # Arguments
116 ///
117 /// * `name` - Field name
118 /// * `value` - Field value
119 fn visit_infobox(&mut self, name: &str, value: &str);
120
121 /// Called when encountering a reference.
122 ///
123 /// # Arguments
124 ///
125 /// * `id` - Reference identifier
126 /// * `ref_type` - Reference type ("web", "book", "text")
127 /// * `metadata` - Reference metadata (author, title, URL, etc.)
128 fn visit_reference(&mut self, id: &str, ref_type: &str, metadata: &Value);
129
130 /// Called when finished processing an article.
131 ///
132 /// Use this to finalize per-article processing.
133 fn visit_article_end(&mut self);
134}
135
/// A no-op visitor that does nothing.
///
/// Useful for:
/// - Testing
/// - Benchmarking
/// - Standing in wherever a visitor is required but its callbacks should be ignored
///
/// The type is a zero-sized unit struct, so it is `Copy` and can be created
/// either with the literal `NoOpVisitor` or with `NoOpVisitor::default()`.
///
/// # Example
///
/// ```rust
/// use wme_stream::{ArticleVisitor, NoOpVisitor};
///
/// let mut visitor = NoOpVisitor::default();
/// // All methods are no-ops
/// visitor.visit_article_start(1, "Example");
/// visitor.visit_article_end();
/// assert_eq!(visitor, NoOpVisitor);
/// ```
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq, Hash)]
pub struct NoOpVisitor;
152
153impl ArticleVisitor for NoOpVisitor {
154 fn visit_article_start(&mut self, _id: u64, _name: &str) {}
155 fn visit_category(&mut self, _name: &str, _url: &str) {}
156 fn visit_link(&mut self, _text: &str, _url: &str) {}
157 fn visit_infobox(&mut self, _name: &str, _value: &str) {}
158 fn visit_reference(&mut self, _id: &str, _ref_type: &str, _metadata: &Value) {}
159 fn visit_article_end(&mut self) {}
160}
161
/// A visitor that collects counts only (low memory footprint).
///
/// Tracks aggregate statistics without storing individual articles.
/// Useful for getting overview statistics of a snapshot.
///
/// Each counter is incremented by one every time the corresponding
/// `ArticleVisitor` callback is invoked on this visitor.
///
/// # Example
///
/// ```rust
/// use wme_stream::{ArticleVisitor, StatsVisitor};
///
/// let mut visitor = StatsVisitor::new();
///
/// // Simulate visiting articles
/// for id in 0..100u64 {
///     visitor.visit_article_start(id, "Article");
///     visitor.visit_category("Category:Test", "https://en.wikipedia.org/wiki/Category:Test");
///     visitor.visit_link("Link", "https://en.wikipedia.org/wiki/Link");
///     visitor.visit_article_end();
/// }
///
/// assert_eq!(visitor.article_count, 100);
/// assert_eq!(visitor.category_count, 100);
/// assert_eq!(visitor.link_count, 100);
/// ```
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct StatsVisitor {
    /// Total articles processed (number of `visit_article_start` calls)
    pub article_count: u64,
    /// Total categories found (number of `visit_category` calls)
    pub category_count: u64,
    /// Total links found (number of `visit_link` calls)
    pub link_count: u64,
    /// Total infobox fields found (number of `visit_infobox` calls, which
    /// is invoked once per infobox field rather than once per infobox)
    pub infobox_count: u64,
    /// Total references found (number of `visit_reference` calls)
    pub reference_count: u64,
}
199
200impl StatsVisitor {
201 /// Create a new stats visitor with all counts at zero.
202 ///
203 /// # Example
204 ///
205 /// ```rust
206 /// use wme_stream::StatsVisitor;
207 ///
208 /// let visitor = StatsVisitor::new();
209 /// assert_eq!(visitor.article_count, 0);
210 /// assert_eq!(visitor.category_count, 0);
211 /// ```
212 pub fn new() -> Self {
213 Self {
214 article_count: 0,
215 category_count: 0,
216 link_count: 0,
217 infobox_count: 0,
218 reference_count: 0,
219 }
220 }
221}
222
223impl ArticleVisitor for StatsVisitor {
224 fn visit_article_start(&mut self, _id: u64, _name: &str) {
225 self.article_count += 1;
226 }
227
228 fn visit_category(&mut self, _name: &str, _url: &str) {
229 self.category_count += 1;
230 }
231
232 fn visit_link(&mut self, _text: &str, _url: &str) {
233 self.link_count += 1;
234 }
235
236 fn visit_infobox(&mut self, _name: &str, _value: &str) {
237 self.infobox_count += 1;
238 }
239
240 fn visit_reference(&mut self, _id: &str, _ref_type: &str, _metadata: &Value) {
241 self.reference_count += 1;
242 }
243
244 fn visit_article_end(&mut self) {}
245}
246
247impl Default for StatsVisitor {
248 fn default() -> Self {
249 Self::new()
250 }
251}
252
#[cfg(test)]
mod tests {
    use super::*;

    /// Every `NoOpVisitor` callback can be invoked without panicking.
    #[test]
    fn test_noop_visitor() {
        let metadata = Value::Null;
        let mut noop = NoOpVisitor;

        noop.visit_article_start(1, "Test");
        noop.visit_category("Cat", "https://example.com");
        noop.visit_link("Link", "https://example.com");
        noop.visit_infobox("Name", "Value");
        noop.visit_reference("ref1", "web", &metadata);
        noop.visit_article_end();
    }

    /// A freshly constructed `StatsVisitor` reports zero for every counter.
    #[test]
    fn test_stats_visitor_new() {
        let StatsVisitor {
            article_count,
            category_count,
            link_count,
            infobox_count,
            reference_count,
        } = StatsVisitor::new();

        assert_eq!(article_count, 0);
        assert_eq!(category_count, 0);
        assert_eq!(link_count, 0);
        assert_eq!(infobox_count, 0);
        assert_eq!(reference_count, 0);
    }

    /// Counters accumulate across articles, one increment per callback.
    #[test]
    fn test_stats_visitor_counts() {
        const ARTICLES: u64 = 10;

        // Per-article fixture: 2 categories, 3 links, 1 infobox field, 2 references.
        let categories = [
            ("Category:A", "https://example.com/A"),
            ("Category:B", "https://example.com/B"),
        ];
        let links = [
            ("Link1", "https://example.com/1"),
            ("Link2", "https://example.com/2"),
            ("Link3", "https://example.com/3"),
        ];

        let mut stats = StatsVisitor::new();

        for article_id in 0..ARTICLES {
            let title = format!("Article {}", article_id);
            stats.visit_article_start(article_id, &title);

            for &(name, url) in &categories {
                stats.visit_category(name, url);
            }

            for &(text, url) in &links {
                stats.visit_link(text, url);
            }

            stats.visit_infobox("Name", "Value");

            let web_ref = format!("ref{}", article_id * 2);
            let book_ref = format!("ref{}", article_id * 2 + 1);
            stats.visit_reference(&web_ref, "web", &Value::Null);
            stats.visit_reference(&book_ref, "book", &Value::Null);

            stats.visit_article_end();
        }

        assert_eq!(stats.article_count, ARTICLES);
        assert_eq!(stats.category_count, ARTICLES * 2);
        assert_eq!(stats.link_count, ARTICLES * 3);
        assert_eq!(stats.infobox_count, ARTICLES);
        assert_eq!(stats.reference_count, ARTICLES * 2);
    }

    /// `Default` yields the same zeroed state as `new`.
    #[test]
    fn test_stats_visitor_default() {
        let stats = StatsVisitor::default();
        assert_eq!(stats.article_count, 0);
        assert_eq!(stats.category_count, 0);
    }

    /// Custom visitor used to check that callback arguments reach implementors:
    /// it remembers the most recent article and every category name it sees.
    struct TestVisitor {
        last_article_id: Option<u64>,
        last_article_name: Option<String>,
        categories: Vec<String>,
    }

    impl ArticleVisitor for TestVisitor {
        fn visit_article_start(&mut self, id: u64, name: &str) {
            self.last_article_id = Some(id);
            self.last_article_name = Some(String::from(name));
        }

        fn visit_category(&mut self, name: &str, _: &str) {
            self.categories.push(String::from(name));
        }

        fn visit_link(&mut self, _: &str, _: &str) {}
        fn visit_infobox(&mut self, _: &str, _: &str) {}
        fn visit_reference(&mut self, _: &str, _: &str, _: &Value) {}
        fn visit_article_end(&mut self) {}
    }

    /// A hand-written visitor receives the id, title and category name unchanged.
    #[test]
    fn test_custom_visitor() {
        let mut recorder = TestVisitor {
            last_article_id: None,
            last_article_name: None,
            categories: Vec::new(),
        };

        recorder.visit_article_start(42, "Test Article");
        recorder.visit_category("Category:Test", "https://example.com");
        recorder.visit_article_end();

        assert_eq!(recorder.last_article_id, Some(42));
        assert_eq!(
            recorder.last_article_name,
            Some(String::from("Test Article"))
        );
        assert_eq!(recorder.categories, vec![String::from("Category:Test")]);
    }
}