1use scraper::{Html, Selector};
16use serde::{Deserialize, Serialize};
17
18#[derive(Debug, Clone, Default, Serialize, Deserialize)]
20pub struct NamedLink {
21 pub name: String,
22 #[serde(skip_serializing_if = "Option::is_none")]
23 pub url: Option<String>,
24}
25
26#[derive(Debug, Clone, Default, Serialize, Deserialize)]
28#[serde(rename_all = "camelCase")]
29pub struct ArticleMetadata {
30 #[serde(skip_serializing_if = "Option::is_none")]
31 pub author: Option<String>,
32 #[serde(skip_serializing_if = "Option::is_none")]
33 pub author_url: Option<String>,
34 #[serde(skip_serializing_if = "Option::is_none")]
35 pub author_full_name: Option<String>,
36 #[serde(skip_serializing_if = "Option::is_none")]
37 pub author_karma: Option<String>,
38 #[serde(skip_serializing_if = "Option::is_none")]
39 pub publish_date: Option<String>,
40 #[serde(skip_serializing_if = "Option::is_none")]
41 pub publish_date_text: Option<String>,
42 #[serde(skip_serializing_if = "Option::is_none")]
43 pub date_modified: Option<String>,
44 #[serde(skip_serializing_if = "Option::is_none")]
45 pub reading_time: Option<String>,
46 #[serde(skip_serializing_if = "Option::is_none")]
47 pub difficulty: Option<String>,
48 #[serde(skip_serializing_if = "Option::is_none")]
49 pub views: Option<String>,
50 #[serde(skip_serializing_if = "Option::is_none")]
51 pub votes: Option<String>,
52 #[serde(skip_serializing_if = "Option::is_none")]
53 pub comments: Option<String>,
54 #[serde(skip_serializing_if = "Option::is_none")]
55 pub bookmarks: Option<String>,
56 #[serde(skip_serializing_if = "Vec::is_empty", default)]
57 pub hubs: Vec<String>,
58 #[serde(skip_serializing_if = "Vec::is_empty", default)]
59 pub hub_urls: Vec<NamedLink>,
60 #[serde(skip_serializing_if = "Vec::is_empty", default)]
61 pub tags: Vec<String>,
62 #[serde(skip_serializing_if = "Vec::is_empty", default)]
63 pub tag_links: Vec<NamedLink>,
64 #[serde(skip_serializing_if = "Option::is_none")]
65 pub is_translation: Option<bool>,
66 #[serde(skip_serializing_if = "Option::is_none")]
67 pub translation_label: Option<String>,
68 #[serde(skip_serializing_if = "Option::is_none")]
69 pub original_article_url: Option<String>,
70 #[serde(skip_serializing_if = "Option::is_none")]
71 pub original_authors: Option<String>,
72 #[serde(skip_serializing_if = "Option::is_none")]
73 pub original_author_text: Option<String>,
74}
75
76fn select_text(document: &Html, selector_str: &str) -> Option<String> {
78 let sel = Selector::parse(selector_str).ok()?;
79 let el = document.select(&sel).next()?;
80 let text: String = el.text().collect::<String>().trim().to_string();
81 if text.is_empty() {
82 None
83 } else {
84 Some(text)
85 }
86}
87
88fn select_attr(document: &Html, selector_str: &str, attr: &str) -> Option<String> {
90 let sel = Selector::parse(selector_str).ok()?;
91 let el = document.select(&sel).next()?;
92 el.value().attr(attr).map(String::from)
93}
94
95#[must_use]
99#[allow(clippy::too_many_lines)]
100pub fn extract_metadata(html: &str) -> ArticleMetadata {
101 let document = Html::parse_document(html);
102 let mut meta = ArticleMetadata::default();
103
104 if let Some(author_text) = select_text(&document, ".tm-user-info__username") {
106 meta.author = Some(author_text);
107 }
108 if let Some(author_url) = select_attr(&document, ".tm-user-info__username", "href") {
109 meta.author_url = Some(author_url);
110 }
111
112 if let Some(datetime) = select_attr(&document, "time[datetime]", "datetime") {
114 meta.publish_date = Some(datetime);
115 }
116 if let Some(date_text) = select_text(&document, "time[datetime]") {
117 meta.publish_date_text = Some(date_text);
118 }
119
120 meta.reading_time = select_text(&document, ".tm-article-reading-time__label");
122
123 meta.difficulty = select_text(&document, ".tm-article-complexity__label");
125
126 if let Ok(sel) = Selector::parse(".tm-icon-counter__value") {
128 if let Some(el) = document.select(&sel).next() {
129 if let Some(title) = el.value().attr("title") {
130 meta.views = Some(title.to_string());
131 } else {
132 let text: String = el.text().collect::<String>().trim().to_string();
133 if !text.is_empty() {
134 meta.views = Some(text);
135 }
136 }
137 }
138 }
139
140 if let Ok(sel) = Selector::parse(".tm-publication-hub__link") {
142 let mut hubs = Vec::new();
143 let mut hub_urls = Vec::new();
144 for el in document.select(&sel) {
145 let name = Selector::parse("span:first-child")
147 .ok()
148 .and_then(|span_sel| {
149 el.select(&span_sel)
150 .next()
151 .map(|span| span.text().collect::<String>().trim().to_string())
152 .filter(|s| !s.is_empty())
153 });
154 let name = name.unwrap_or_else(|| {
155 el.text()
156 .collect::<String>()
157 .trim()
158 .trim_end_matches('*')
159 .trim()
160 .to_string()
161 });
162 let url = el.value().attr("href").map(String::from);
163 hubs.push(name.clone());
164 hub_urls.push(NamedLink { name, url });
165 }
166 if !hubs.is_empty() {
167 meta.hubs = hubs;
168 meta.hub_urls = hub_urls;
169 }
170 }
171
172 if let Some(content) = select_attr(&document, r#"meta[name="keywords"]"#, "content") {
174 let tags: Vec<String> = content
175 .split(',')
176 .map(|t| t.trim().to_string())
177 .filter(|t| !t.is_empty())
178 .collect();
179 if !tags.is_empty() {
180 meta.tags = tags;
181 }
182 }
183
184 if let Ok(sel) = Selector::parse(".tm-article-body__tags-item a, .tm-tags-list__link") {
186 let mut tag_links = Vec::new();
187 for el in document.select(&sel) {
188 let name = el.text().collect::<String>().trim().to_string();
189 let url = el.value().attr("href").map(String::from);
190 if !name.is_empty() {
191 tag_links.push(NamedLink { name, url });
192 }
193 }
194 if !tag_links.is_empty() {
195 meta.tag_links = tag_links;
196 }
197 }
198
199 if let Some(text) = select_text(&document, ".tm-publication-label_variant-translation") {
201 meta.is_translation = Some(true);
202 meta.translation_label = Some(text);
203 }
204
205 if let Ok(sel) = Selector::parse(".tm-article-presenter__origin-link") {
207 if let Some(el) = document.select(&sel).next() {
208 meta.original_article_url = el.value().attr("href").map(String::from);
209 if let Ok(span_sel) = Selector::parse("span") {
210 if let Some(span) = el.select(&span_sel).next() {
211 let text = span.text().collect::<String>().trim().to_string();
212 if !text.is_empty() {
213 meta.original_authors = Some(text);
214 }
215 }
216 }
217 let full_text = el.text().collect::<String>().trim().to_string();
218 if !full_text.is_empty() {
219 meta.original_author_text = Some(full_text);
220 }
221 }
222 }
223
224 if let Ok(sel) = Selector::parse(r#"script[type="application/ld+json"]"#) {
226 if let Some(el) = document.select(&sel).next() {
227 let json_text: String = el.text().collect();
228 if let Ok(value) = serde_json::from_str::<serde_json::Value>(&json_text) {
229 if let Some(modified) = value.get("dateModified").and_then(|v| v.as_str()) {
230 meta.date_modified = Some(modified.to_string());
231 }
232 if let Some(author_name) = value
233 .get("author")
234 .and_then(|a| a.get("name"))
235 .and_then(|n| n.as_str())
236 {
237 meta.author_full_name = Some(author_name.to_string());
238 }
239 }
240 }
241 }
242
243 meta.votes = select_text(&document, ".tm-votes-meter__value");
245
246 meta.comments = select_text(&document, ".tm-article-comments-counter-link__value");
248
249 meta.bookmarks = select_text(&document, ".bookmarks-button__counter");
251
252 meta.author_karma = select_text(&document, ".tm-karma__votes");
254
255 meta
256}
257
258#[must_use]
262pub fn format_metadata_block(metadata: &ArticleMetadata) -> Vec<String> {
263 let mut lines = Vec::new();
264
265 if let Some(ref author) = metadata.author {
267 let author_name = metadata
268 .author_full_name
269 .as_ref()
270 .map_or_else(|| author.clone(), |full| format!("{full} ({author})"));
271 let author_link = metadata.author_url.as_ref().map_or_else(
272 || author_name.clone(),
273 |url| format!("[{author_name}]({url})"),
274 );
275 lines.push(format!("**Author:** {author_link}"));
276 }
277
278 if metadata.is_translation == Some(true) {
280 let label = metadata
281 .translation_label
282 .as_deref()
283 .unwrap_or("Translation");
284 lines.push(format!("**Type:** {label}"));
285 }
286
287 if let Some(ref authors) = metadata.original_authors {
289 if let Some(ref url) = metadata.original_article_url {
290 lines.push(format!("**Original article:** [{authors}]({url})"));
291 } else {
292 lines.push(format!("**Original authors:** {authors}"));
293 }
294 }
295
296 if let Some(ref date) = metadata.publish_date {
298 let mut date_line = format!("**Published:** {date}");
299 if let Some(ref modified) = metadata.date_modified {
300 if modified != date {
301 date_line.push_str(" (updated ");
302 date_line.push_str(modified);
303 date_line.push(')');
304 }
305 }
306 lines.push(date_line);
307 }
308
309 let mut info_items = Vec::new();
311 if let Some(ref rt) = metadata.reading_time {
312 info_items.push(format!("Reading time: {rt}"));
313 }
314 if let Some(ref diff) = metadata.difficulty {
315 info_items.push(format!("Difficulty: {diff}"));
316 }
317 if let Some(ref views) = metadata.views {
318 info_items.push(format!("Views: {views}"));
319 }
320 if !info_items.is_empty() {
321 lines.push(format!("**{}**", info_items.join(" | ")));
322 }
323
324 if !metadata.hubs.is_empty() {
326 lines.push(format!("**Hubs:** {}", metadata.hubs.join(", ")));
327 }
328
329 if !metadata.tags.is_empty() {
331 lines.push(format!("**Tags:** {}", metadata.tags.join(", ")));
332 }
333
334 lines
335}
336
337#[must_use]
341pub fn format_footer_block(metadata: &ArticleMetadata) -> Vec<String> {
342 let mut lines = Vec::new();
343 lines.push("---".to_string());
344 lines.push(String::new());
345
346 if !metadata.tag_links.is_empty() {
348 let tag_strings: Vec<String> = metadata
349 .tag_links
350 .iter()
351 .map(|t| {
352 t.url
353 .as_ref()
354 .map_or_else(|| t.name.clone(), |url| format!("[{}]({})", t.name, url))
355 })
356 .collect();
357 lines.push(format!("**Tags:** {}", tag_strings.join(", ")));
358 lines.push(String::new());
359 } else if !metadata.tags.is_empty() {
360 lines.push(format!("**Tags:** {}", metadata.tags.join(", ")));
361 lines.push(String::new());
362 }
363
364 if !metadata.hub_urls.is_empty() {
366 let hub_strings: Vec<String> = metadata
367 .hub_urls
368 .iter()
369 .map(|h| {
370 h.url
371 .as_ref()
372 .map_or_else(|| h.name.clone(), |url| format!("[{}]({})", h.name, url))
373 })
374 .collect();
375 lines.push(format!("**Hubs:** {}", hub_strings.join(", ")));
376 lines.push(String::new());
377 } else if !metadata.hubs.is_empty() {
378 lines.push(format!("**Hubs:** {}", metadata.hubs.join(", ")));
379 lines.push(String::new());
380 }
381
382 let mut stats = Vec::new();
384 if let Some(ref votes) = metadata.votes {
385 stats.push(format!("Votes: {votes}"));
386 }
387 if let Some(ref views) = metadata.views {
388 stats.push(format!("Views: {views}"));
389 }
390 if let Some(ref bookmarks) = metadata.bookmarks {
391 stats.push(format!("Bookmarks: {bookmarks}"));
392 }
393 if let Some(ref comments) = metadata.comments {
394 stats.push(format!("Comments: {comments}"));
395 }
396 if !stats.is_empty() {
397 lines.push(format!("**{}**", stats.join(" | ")));
398 lines.push(String::new());
399 }
400
401 if let Some(ref author) = metadata.author {
403 let author_name = metadata
404 .author_full_name
405 .as_ref()
406 .map_or_else(|| author.clone(), |full| format!("{full} ({author})"));
407 let author_link = metadata.author_url.as_ref().map_or_else(
408 || author_name.clone(),
409 |url| format!("[{author_name}]({url})"),
410 );
411 let mut author_entry = format!("**Author:** {author_link}");
412 if let Some(ref karma) = metadata.author_karma {
413 author_entry.push_str(" | Karma: ");
414 author_entry.push_str(karma);
415 }
416 lines.push(author_entry);
417 lines.push(String::new());
418 }
419
420 lines
421}