1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
use crate::error::ExtrablattError;
use crate::language::Language;
use crate::{Article, ArticleStream, DefaultExtractor, Extractor};
use anyhow::Result;
use futures::Stream;
use std::borrow::Borrow;
use url::Url;
/// A category, identified solely by its URL.
///
/// Equality, hashing and cloning are all derived from the wrapped [`Url`].
#[derive(Debug, Clone, Eq, PartialEq, Hash)]
pub struct Category {
/// The URL this category points to.
pub url: Url,
}
impl Category {
    /// Creates a category for the given `url`.
    pub fn new(url: Url) -> Self {
        Self { url }
    }

    /// Guesses the language of this category from its URL.
    ///
    /// The languages returned by `Language::known_languages()` are probed in
    /// order and the first one that matches is returned. A language matches
    /// when either
    ///
    /// * the URL's domain ends with `.<identifier>`, starts with
    ///   `<identifier>.`, or starts with `<full name>.` (full name
    ///   lowercased), e.g. `arabic.cnn.com`; or
    /// * the first path segment equals the language's full name, compared
    ///   case-insensitively, e.g. `cnn.com/Arabic/`.
    ///
    /// Returns `None` when no known language matches.
    pub fn language_hint(&self) -> Option<Language> {
        // Neither URL component depends on the candidate language, so look
        // them up (and lowercase the path segment) once, not once per language.
        let domain = self.url.domain();
        let first_segment = self
            .url
            .path_segments()
            .and_then(|mut segments| segments.next())
            .map(str::to_lowercase);

        for lang in Language::known_languages() {
            let full_name = lang.full_name().to_lowercase();

            // The domain is checked before the path for every language so
            // that the "first match wins" order stays the same.
            if let Some(domain) = domain {
                let id = lang.identifier();
                if domain.ends_with(&format!(".{}", id))
                    || domain.starts_with(&format!("{}.", id))
                    || domain.starts_with(&format!("{}.", full_name))
                {
                    return Some(lang.clone());
                }
            }

            if first_segment.as_ref() == Some(&full_name) {
                return Some(lang.clone());
            }
        }
        None
    }

    /// Consumes the category and turns it into a stream of articles, using
    /// [`DefaultExtractor`].
    ///
    /// This is shorthand for calling [`Category::into_stream_with_extractor`]
    /// with `DefaultExtractor`.
    ///
    /// # Errors
    ///
    /// Returns whatever error `into_stream_with_extractor` reports.
    pub async fn into_stream(
        self,
    ) -> Result<impl Stream<Item = std::result::Result<Article, ExtrablattError>>> {
        // Both functions return the same `anyhow::Result`, so the awaited
        // value can be returned as-is without re-wrapping it in `Ok(..?)`.
        self.into_stream_with_extractor(DefaultExtractor).await
    }

    /// Consumes the category and turns it into a stream of articles, using
    /// the supplied `extractor`.
    ///
    /// The stream is created by `ArticleStream::new_with_extractor` from this
    /// category's URL; each item is either an [`Article`] or an
    /// [`ExtrablattError`].
    ///
    /// # Errors
    ///
    /// Returns an error if `ArticleStream::new_with_extractor` fails.
    pub async fn into_stream_with_extractor<TExtractor: Extractor + Unpin>(
        self,
        extractor: TExtractor,
    ) -> Result<impl Stream<Item = std::result::Result<Article, ExtrablattError>>> {
        // `?` is kept here: the constructor's error type is converted into
        // `anyhow::Error` at this boundary.
        let stream = ArticleStream::new_with_extractor(self.url, extractor).await?;
        Ok(stream)
    }
}
/// Allows a `Category` to be borrowed as the string form of its URL.
///
/// NOTE(review): the `Borrow` contract requires `Eq`/`Hash` of `Category` to
/// agree with those of the returned `str`; this presumably holds because `Url`
/// compares and hashes by its serialized string — confirm against the `url`
/// crate version in use.
impl Borrow<str> for Category {
    /// Returns the category's URL as a string slice.
    fn borrow(&self) -> &str {
        let Self { url } = self;
        url.as_str()
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Parses `raw` into a URL and returns the language hint of the category
    /// built from it.
    fn hint_for(raw: &str) -> Option<Language> {
        let url = Url::parse(raw).unwrap();
        Category::new(url).language_hint()
    }

    #[test]
    fn category_lang_hint() {
        // Language name used as the leading sub-domain.
        assert_eq!(hint_for("https://arabic.cnn.com/"), Some(Language::Arabic));
        // Language name used as the first path segment, in mixed case.
        assert_eq!(hint_for("https://cnn.com/Arabic/"), Some(Language::Arabic));
        // A first path segment that is not a language name yields no hint.
        assert_eq!(hint_for("https://cnn.com/Europe"), None);
    }
}