progscrape_scrapers/types/
url.rs

1use std::{
2    collections::hash_map::DefaultHasher,
3    fmt::Display,
4    hash::{Hash, Hasher},
5};
6
7use serde::{Deserialize, Serialize};
8use url::Url;
9use urlnorm::UrlNormalizer;
10
lazy_static::lazy_static! {
    /// Shared URL normalizer, built once on first use with `UrlNormalizer::default()`.
    ///
    /// `StoryUrl::parse` uses it to derive both the normalized host and the
    /// normalization string of every URL it accepts.
    static ref URL_NORMALIZER: UrlNormalizer = UrlNormalizer::default();
}
14
15/// Story-specific URL that caches the normalization information and other important parts of the URL.
16#[derive(Clone, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)]
17pub struct StoryUrl {
18    url: String,
19    host: String,
20    norm_str: StoryUrlNorm,
21}
22
23impl Serialize for StoryUrl {
24    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
25    where
26        S: serde::Serializer,
27    {
28        let tuple: (&String, &String, &String) = (&self.url, &self.host, &self.norm_str.norm);
29        tuple.serialize(serializer)
30    }
31}
32
33impl<'de> Deserialize<'de> for StoryUrl {
34    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
35    where
36        D: serde::Deserializer<'de>,
37    {
38        // The StoryUrl can be either a tuple with the underlying bits, or a raw URL that we need to parse
39        #[derive(Deserialize)]
40        #[serde(untagged)]
41        enum StoryUrlSerializationOptions {
42            Raw(String),
43            Bits((String, String, String)),
44        }
45
46        let res: Result<StoryUrlSerializationOptions, D::Error> =
47            Deserialize::deserialize(deserializer);
48        match res {
49            Ok(StoryUrlSerializationOptions::Raw(raw)) => StoryUrl::parse(&raw).ok_or(
50                serde::de::Error::custom(format!("Failed to parse URL '{raw}'")),
51            ),
52            Ok(StoryUrlSerializationOptions::Bits((url, host, norm))) => Ok(StoryUrl {
53                url,
54                host,
55                norm_str: StoryUrlNorm { norm },
56            }),
57            Err(e) => Err(e),
58        }
59    }
60}
61
62impl Display for StoryUrl {
63    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
64        self.url.fmt(f)
65    }
66}
67
68impl StoryUrl {
69    pub fn parse<S: AsRef<str>>(s: S) -> Option<Self> {
70        if let Ok(url) = Url::parse(s.as_ref()) {
71            if let Some(host) = URL_NORMALIZER.normalize_host(&url) {
72                let host = host.to_owned();
73                let norm_str = StoryUrlNorm {
74                    norm: URL_NORMALIZER.compute_normalization_string(&url),
75                };
76                let url = url.into();
77                return Some(Self {
78                    url,
79                    host,
80                    norm_str,
81                });
82            }
83        }
84        None
85    }
86
87    pub fn host(&self) -> &str {
88        &self.host
89    }
90
91    pub fn raw(&self) -> &str {
92        &self.url
93    }
94
95    pub fn normalization(&self) -> &StoryUrlNorm {
96        &self.norm_str
97    }
98}
99
/// Newtype around a URL's normalization string.
///
/// Equality, ordering and hashing are derived and therefore operate on the wrapped string.
#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub struct StoryUrlNorm {
    /// The normalization string itself.
    norm: String,
}
104
105impl Serialize for StoryUrlNorm {
106    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
107    where
108        S: serde::Serializer,
109    {
110        self.norm.serialize(serializer)
111    }
112}
113
114impl<'de> Deserialize<'de> for StoryUrlNorm {
115    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
116    where
117        D: serde::Deserializer<'de>,
118    {
119        let res: Result<String, _> = Deserialize::deserialize(deserializer);
120        res.map(|norm| StoryUrlNorm { norm })
121    }
122}
123
124impl StoryUrlNorm {
125    /// Re-create a story norm, if you know what you're doing.
126    pub fn from_string(norm: String) -> Self {
127        Self { norm }
128    }
129
130    pub fn hash(&self) -> i64 {
131        let mut hasher = DefaultHasher::new();
132        self.norm.hash(&mut hasher);
133
134        hasher.finish() as i64
135    }
136
137    pub fn string(&self) -> &str {
138        &self.norm
139    }
140}