Skip to main content

web_capture/
metadata.rs

//! Article metadata extraction module (R1).
//!
//! Extracts metadata from web pages, including:
//! - Author information (name, URL, karma)
//! - Publication date and modification date
//! - Reading time and difficulty
//! - Views, votes, bookmarks, comments
//! - Hubs and tags (with URLs)
//! - Translation information
//! - LD+JSON structured data
//!
//! Based on the reference implementation from:
//! <https://github.com/link-foundation/meta-theory/blob/main/scripts/download-article.mjs>
14
15use scraper::{Html, Selector};
16use serde::{Deserialize, Serialize};
17
18/// Link with name and URL.
19#[derive(Debug, Clone, Default, Serialize, Deserialize)]
20pub struct NamedLink {
21    pub name: String,
22    #[serde(skip_serializing_if = "Option::is_none")]
23    pub url: Option<String>,
24}
25
26/// Extracted article metadata.
27#[derive(Debug, Clone, Default, Serialize, Deserialize)]
28#[serde(rename_all = "camelCase")]
29pub struct ArticleMetadata {
30    #[serde(skip_serializing_if = "Option::is_none")]
31    pub author: Option<String>,
32    #[serde(skip_serializing_if = "Option::is_none")]
33    pub author_url: Option<String>,
34    #[serde(skip_serializing_if = "Option::is_none")]
35    pub author_full_name: Option<String>,
36    #[serde(skip_serializing_if = "Option::is_none")]
37    pub author_karma: Option<String>,
38    #[serde(skip_serializing_if = "Option::is_none")]
39    pub publish_date: Option<String>,
40    #[serde(skip_serializing_if = "Option::is_none")]
41    pub publish_date_text: Option<String>,
42    #[serde(skip_serializing_if = "Option::is_none")]
43    pub date_modified: Option<String>,
44    #[serde(skip_serializing_if = "Option::is_none")]
45    pub reading_time: Option<String>,
46    #[serde(skip_serializing_if = "Option::is_none")]
47    pub difficulty: Option<String>,
48    #[serde(skip_serializing_if = "Option::is_none")]
49    pub views: Option<String>,
50    #[serde(skip_serializing_if = "Option::is_none")]
51    pub votes: Option<String>,
52    #[serde(skip_serializing_if = "Option::is_none")]
53    pub comments: Option<String>,
54    #[serde(skip_serializing_if = "Option::is_none")]
55    pub bookmarks: Option<String>,
56    #[serde(skip_serializing_if = "Vec::is_empty", default)]
57    pub hubs: Vec<String>,
58    #[serde(skip_serializing_if = "Vec::is_empty", default)]
59    pub hub_urls: Vec<NamedLink>,
60    #[serde(skip_serializing_if = "Vec::is_empty", default)]
61    pub tags: Vec<String>,
62    #[serde(skip_serializing_if = "Vec::is_empty", default)]
63    pub tag_links: Vec<NamedLink>,
64    #[serde(skip_serializing_if = "Option::is_none")]
65    pub is_translation: Option<bool>,
66    #[serde(skip_serializing_if = "Option::is_none")]
67    pub translation_label: Option<String>,
68    #[serde(skip_serializing_if = "Option::is_none")]
69    pub original_article_url: Option<String>,
70    #[serde(skip_serializing_if = "Option::is_none")]
71    pub original_authors: Option<String>,
72    #[serde(skip_serializing_if = "Option::is_none")]
73    pub original_author_text: Option<String>,
74}
75
76/// Helper to select first element and get its text content.
77fn select_text(document: &Html, selector_str: &str) -> Option<String> {
78    let sel = Selector::parse(selector_str).ok()?;
79    let el = document.select(&sel).next()?;
80    let text: String = el.text().collect::<String>().trim().to_string();
81    if text.is_empty() {
82        None
83    } else {
84        Some(text)
85    }
86}
87
88/// Helper to select first element and get an attribute value.
89fn select_attr(document: &Html, selector_str: &str, attr: &str) -> Option<String> {
90    let sel = Selector::parse(selector_str).ok()?;
91    let el = document.select(&sel).next()?;
92    el.value().attr(attr).map(String::from)
93}
94
95/// Extract article metadata from HTML.
96///
97/// Works without a browser by parsing the HTML directly with scraper.
98#[must_use]
99#[allow(clippy::too_many_lines)]
100pub fn extract_metadata(html: &str) -> ArticleMetadata {
101    let document = Html::parse_document(html);
102    let mut meta = ArticleMetadata::default();
103
104    // Author
105    if let Some(author_text) = select_text(&document, ".tm-user-info__username") {
106        meta.author = Some(author_text);
107    }
108    if let Some(author_url) = select_attr(&document, ".tm-user-info__username", "href") {
109        meta.author_url = Some(author_url);
110    }
111
112    // Publication date
113    if let Some(datetime) = select_attr(&document, "time[datetime]", "datetime") {
114        meta.publish_date = Some(datetime);
115    }
116    if let Some(date_text) = select_text(&document, "time[datetime]") {
117        meta.publish_date_text = Some(date_text);
118    }
119
120    // Reading time
121    meta.reading_time = select_text(&document, ".tm-article-reading-time__label");
122
123    // Difficulty
124    meta.difficulty = select_text(&document, ".tm-article-complexity__label");
125
126    // Views
127    if let Ok(sel) = Selector::parse(".tm-icon-counter__value") {
128        if let Some(el) = document.select(&sel).next() {
129            if let Some(title) = el.value().attr("title") {
130                meta.views = Some(title.to_string());
131            } else {
132                let text: String = el.text().collect::<String>().trim().to_string();
133                if !text.is_empty() {
134                    meta.views = Some(text);
135                }
136            }
137        }
138    }
139
140    // Hubs
141    if let Ok(sel) = Selector::parse(".tm-publication-hub__link") {
142        let mut hubs = Vec::new();
143        let mut hub_urls = Vec::new();
144        for el in document.select(&sel) {
145            // Try to get name from first span child
146            let name = Selector::parse("span:first-child")
147                .ok()
148                .and_then(|span_sel| {
149                    el.select(&span_sel)
150                        .next()
151                        .map(|span| span.text().collect::<String>().trim().to_string())
152                        .filter(|s| !s.is_empty())
153                });
154            let name = name.unwrap_or_else(|| {
155                el.text()
156                    .collect::<String>()
157                    .trim()
158                    .trim_end_matches('*')
159                    .trim()
160                    .to_string()
161            });
162            let url = el.value().attr("href").map(String::from);
163            hubs.push(name.clone());
164            hub_urls.push(NamedLink { name, url });
165        }
166        if !hubs.is_empty() {
167            meta.hubs = hubs;
168            meta.hub_urls = hub_urls;
169        }
170    }
171
172    // Tags from meta keywords
173    if let Some(content) = select_attr(&document, r#"meta[name="keywords"]"#, "content") {
174        let tags: Vec<String> = content
175            .split(',')
176            .map(|t| t.trim().to_string())
177            .filter(|t| !t.is_empty())
178            .collect();
179        if !tags.is_empty() {
180            meta.tags = tags;
181        }
182    }
183
184    // Tags with URLs
185    if let Ok(sel) = Selector::parse(".tm-article-body__tags-item a, .tm-tags-list__link") {
186        let mut tag_links = Vec::new();
187        for el in document.select(&sel) {
188            let name = el.text().collect::<String>().trim().to_string();
189            let url = el.value().attr("href").map(String::from);
190            if !name.is_empty() {
191                tag_links.push(NamedLink { name, url });
192            }
193        }
194        if !tag_links.is_empty() {
195            meta.tag_links = tag_links;
196        }
197    }
198
199    // Translation badge
200    if let Some(text) = select_text(&document, ".tm-publication-label_variant-translation") {
201        meta.is_translation = Some(true);
202        meta.translation_label = Some(text);
203    }
204
205    // Original article link
206    if let Ok(sel) = Selector::parse(".tm-article-presenter__origin-link") {
207        if let Some(el) = document.select(&sel).next() {
208            meta.original_article_url = el.value().attr("href").map(String::from);
209            if let Ok(span_sel) = Selector::parse("span") {
210                if let Some(span) = el.select(&span_sel).next() {
211                    let text = span.text().collect::<String>().trim().to_string();
212                    if !text.is_empty() {
213                        meta.original_authors = Some(text);
214                    }
215                }
216            }
217            let full_text = el.text().collect::<String>().trim().to_string();
218            if !full_text.is_empty() {
219                meta.original_author_text = Some(full_text);
220            }
221        }
222    }
223
224    // LD+JSON structured data
225    if let Ok(sel) = Selector::parse(r#"script[type="application/ld+json"]"#) {
226        if let Some(el) = document.select(&sel).next() {
227            let json_text: String = el.text().collect();
228            if let Ok(value) = serde_json::from_str::<serde_json::Value>(&json_text) {
229                if let Some(modified) = value.get("dateModified").and_then(|v| v.as_str()) {
230                    meta.date_modified = Some(modified.to_string());
231                }
232                if let Some(author_name) = value
233                    .get("author")
234                    .and_then(|a| a.get("name"))
235                    .and_then(|n| n.as_str())
236                {
237                    meta.author_full_name = Some(author_name.to_string());
238                }
239            }
240        }
241    }
242
243    // Votes
244    meta.votes = select_text(&document, ".tm-votes-meter__value");
245
246    // Comments count
247    meta.comments = select_text(&document, ".tm-article-comments-counter-link__value");
248
249    // Bookmarks count
250    meta.bookmarks = select_text(&document, ".bookmarks-button__counter");
251
252    // Author karma
253    meta.author_karma = select_text(&document, ".tm-karma__votes");
254
255    meta
256}
257
258/// Format metadata as a markdown header block.
259///
260/// Placed after the title in the output markdown.
261#[must_use]
262pub fn format_metadata_block(metadata: &ArticleMetadata) -> Vec<String> {
263    let mut lines = Vec::new();
264
265    // Author line
266    if let Some(ref author) = metadata.author {
267        let author_name = metadata
268            .author_full_name
269            .as_ref()
270            .map_or_else(|| author.clone(), |full| format!("{full} ({author})"));
271        let author_link = metadata.author_url.as_ref().map_or_else(
272            || author_name.clone(),
273            |url| format!("[{author_name}]({url})"),
274        );
275        lines.push(format!("**Author:** {author_link}"));
276    }
277
278    // Translation
279    if metadata.is_translation == Some(true) {
280        let label = metadata
281            .translation_label
282            .as_deref()
283            .unwrap_or("Translation");
284        lines.push(format!("**Type:** {label}"));
285    }
286
287    // Original article
288    if let Some(ref authors) = metadata.original_authors {
289        if let Some(ref url) = metadata.original_article_url {
290            lines.push(format!("**Original article:** [{authors}]({url})"));
291        } else {
292            lines.push(format!("**Original authors:** {authors}"));
293        }
294    }
295
296    // Publication date
297    if let Some(ref date) = metadata.publish_date {
298        let mut date_line = format!("**Published:** {date}");
299        if let Some(ref modified) = metadata.date_modified {
300            if modified != date {
301                date_line.push_str(" (updated ");
302                date_line.push_str(modified);
303                date_line.push(')');
304            }
305        }
306        lines.push(date_line);
307    }
308
309    // Info items
310    let mut info_items = Vec::new();
311    if let Some(ref rt) = metadata.reading_time {
312        info_items.push(format!("Reading time: {rt}"));
313    }
314    if let Some(ref diff) = metadata.difficulty {
315        info_items.push(format!("Difficulty: {diff}"));
316    }
317    if let Some(ref views) = metadata.views {
318        info_items.push(format!("Views: {views}"));
319    }
320    if !info_items.is_empty() {
321        lines.push(format!("**{}**", info_items.join(" | ")));
322    }
323
324    // Hubs
325    if !metadata.hubs.is_empty() {
326        lines.push(format!("**Hubs:** {}", metadata.hubs.join(", ")));
327    }
328
329    // Tags
330    if !metadata.tags.is_empty() {
331        lines.push(format!("**Tags:** {}", metadata.tags.join(", ")));
332    }
333
334    lines
335}
336
337/// Format footer metadata block.
338///
339/// Placed at the end of the article, matching Habr article footer.
340#[must_use]
341pub fn format_footer_block(metadata: &ArticleMetadata) -> Vec<String> {
342    let mut lines = Vec::new();
343    lines.push("---".to_string());
344    lines.push(String::new());
345
346    // Tags with links
347    if !metadata.tag_links.is_empty() {
348        let tag_strings: Vec<String> = metadata
349            .tag_links
350            .iter()
351            .map(|t| {
352                t.url
353                    .as_ref()
354                    .map_or_else(|| t.name.clone(), |url| format!("[{}]({})", t.name, url))
355            })
356            .collect();
357        lines.push(format!("**Tags:** {}", tag_strings.join(", ")));
358        lines.push(String::new());
359    } else if !metadata.tags.is_empty() {
360        lines.push(format!("**Tags:** {}", metadata.tags.join(", ")));
361        lines.push(String::new());
362    }
363
364    // Hubs with links
365    if !metadata.hub_urls.is_empty() {
366        let hub_strings: Vec<String> = metadata
367            .hub_urls
368            .iter()
369            .map(|h| {
370                h.url
371                    .as_ref()
372                    .map_or_else(|| h.name.clone(), |url| format!("[{}]({})", h.name, url))
373            })
374            .collect();
375        lines.push(format!("**Hubs:** {}", hub_strings.join(", ")));
376        lines.push(String::new());
377    } else if !metadata.hubs.is_empty() {
378        lines.push(format!("**Hubs:** {}", metadata.hubs.join(", ")));
379        lines.push(String::new());
380    }
381
382    // Stats
383    let mut stats = Vec::new();
384    if let Some(ref votes) = metadata.votes {
385        stats.push(format!("Votes: {votes}"));
386    }
387    if let Some(ref views) = metadata.views {
388        stats.push(format!("Views: {views}"));
389    }
390    if let Some(ref bookmarks) = metadata.bookmarks {
391        stats.push(format!("Bookmarks: {bookmarks}"));
392    }
393    if let Some(ref comments) = metadata.comments {
394        stats.push(format!("Comments: {comments}"));
395    }
396    if !stats.is_empty() {
397        lines.push(format!("**{}**", stats.join(" | ")));
398        lines.push(String::new());
399    }
400
401    // Author info
402    if let Some(ref author) = metadata.author {
403        let author_name = metadata
404            .author_full_name
405            .as_ref()
406            .map_or_else(|| author.clone(), |full| format!("{full} ({author})"));
407        let author_link = metadata.author_url.as_ref().map_or_else(
408            || author_name.clone(),
409            |url| format!("[{author_name}]({url})"),
410        );
411        let mut author_entry = format!("**Author:** {author_link}");
412        if let Some(ref karma) = metadata.author_karma {
413            author_entry.push_str(" | Karma: ");
414            author_entry.push_str(karma);
415        }
416        lines.push(author_entry);
417        lines.push(String::new());
418    }
419
420    lines
421}