Skip to main content

scraper_trail/archive/
entry.rs

1use crate::{
2    archive::Archiveable,
3    exchange::Exchange,
4    request::{Request, params::Params},
5};
6use std::borrow::Cow;
7
8#[derive(Copy, Clone, Debug, Eq, Ord, PartialEq, PartialOrd, serde::Deserialize)]
9#[serde(field_identifier, rename_all = "lowercase")]
10pub enum Field {
11    Request,
12    Response,
13}
14
15pub struct Entry<'a, T: Archiveable> {
16    pub request_params: T::RequestParams,
17    pub exchange: Exchange<'a, T>,
18}
19
20impl<T: Archiveable + bounded_static::IntoBoundedStatic> bounded_static::IntoBoundedStatic
21    for Entry<'_, T>
22where
23    T::Static: Archiveable,
24    T::RequestParams: Into<<T::Static as Archiveable>::RequestParams>,
25{
26    type Static = Entry<'static, T::Static>;
27
28    fn into_static(self) -> Self::Static {
29        Self::Static {
30            request_params: self.request_params.into(),
31            exchange: self.exchange.into_static(),
32        }
33    }
34}
35
36impl<'a, 'de: 'a, T: Archiveable + 'a> serde::de::Deserialize<'de> for Entry<'a, T> {
37    fn deserialize<D: serde::Deserializer<'de>>(deserializer: D) -> Result<Self, D::Error> {
38        struct EntryVisitor<'a, T>(std::marker::PhantomData<&'a T>);
39
40        impl<'a, 'de: 'a, T: Archiveable> serde::de::Visitor<'de> for EntryVisitor<'a, T> {
41            type Value = Entry<'a, T>;
42
43            fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
44                formatter.write_str("scraper exchange archive entry")
45            }
46
47            fn visit_map<A: serde::de::MapAccess<'de>>(
48                self,
49                mut map: A,
50            ) -> Result<Self::Value, A::Error> {
51                let request = map
52                    .next_entry::<Field, Request<'_>>()?
53                    .and_then(|(field, request)| {
54                        if field == Field::Request {
55                            Some(request)
56                        } else {
57                            None
58                        }
59                    })
60                    .ok_or_else(|| serde::de::Error::missing_field("request"))?;
61
62                let request_params = T::RequestParams::parse_request(&request)
63                    .map_err(|error| error.serde(&request))?;
64
65                let response = T::deserialize_response_field(&request_params, &mut map)?
66                    .and_then(|(field, data)| {
67                        if field == Field::Response {
68                            Some(data)
69                        } else {
70                            None
71                        }
72                    })
73                    .ok_or_else(|| serde::de::Error::missing_field("response"))?;
74
75                match map.next_entry::<Cow<'_, str>, serde::de::IgnoredAny>()? {
76                    Some((field, _)) => Err(serde::de::Error::unknown_field(
77                        &field,
78                        &["request", "response"],
79                    )),
80                    None => Ok(Entry {
81                        request_params,
82                        exchange: Exchange { request, response },
83                    }),
84                }
85            }
86        }
87
88        deserializer.deserialize_map(EntryVisitor(std::marker::PhantomData))
89    }
90}
91
92#[cfg(test)]
93mod tests {
94    use super::{Archiveable, Entry, Field};
95    use crate::exchange::Response;
96    use regex::Regex;
97    use std::borrow::Cow;
98    use std::sync::LazyLock;
99
100    const GOOGLE_PLAY_01_EXAMPLE: &str = include_str!("../../../examples/google-play-01.json");
101
102    #[test]
103    fn deserialize_google_archive() -> Result<(), Box<dyn std::error::Error>> {
104        let archive = serde_json::from_str::<Entry<'_, GoogleData>>(GOOGLE_PLAY_01_EXAMPLE)?;
105
106        assert_eq!(archive.request_params.pagination.country, "us");
107        assert_eq!(archive.request_params.review.app_id, "ai.chesslegends");
108        assert!(matches!(
109            archive.exchange.response.data,
110            GoogleData::Review(serde_json::Value::Array(_))
111        ));
112
113        Ok(())
114    }
115
116    struct ReviewRequest<'a> {
117        pagination: Pagination<'a>,
118        review: Review,
119    }
120
121    impl<'a> crate::request::params::Params for ReviewRequest<'a> {
122        fn build_request(
123            &self,
124            _timestamp: Option<chrono::DateTime<chrono::Utc>>,
125        ) -> crate::request::Request<'_> {
126            // Not tested here.
127            todo![]
128        }
129
130        fn parse_request(
131            request: &crate::request::Request<'_>,
132        ) -> Result<Self, crate::request::params::ParseError> {
133            let pagination = request.url.as_str().parse().map_err(|_| {
134                crate::request::params::ParseError::InvalidUrl {
135                    expected: "Google review pagination request",
136                }
137            })?;
138
139            let review = request
140                .body
141                .as_ref()
142                .and_then(|body| body.parse().ok())
143                .ok_or_else(|| crate::request::params::ParseError::InvalidBody {
144                    expected: "Google review pagination request",
145                })?;
146
147            Ok(Self { pagination, review })
148        }
149    }
150
151    enum GoogleData {
152        Review(serde_json::Value),
153    }
154
155    impl Archiveable for GoogleData {
156        type RequestParams = ReviewRequest<'static>;
157
158        fn deserialize_response_field<'de, A: serde::de::MapAccess<'de>>(
159            _request_params: &Self::RequestParams,
160            map: &mut A,
161        ) -> Result<Option<(Field, Response<'de, Self>)>, A::Error> {
162            Ok(map
163                .next_entry::<Field, Response<'_, serde_json::Value>>()?
164                .map(|(field, response)| (field, response.map(|value| GoogleData::Review(value)))))
165        }
166    }
167
168    #[derive(Clone, Debug, Eq, PartialEq)]
169    struct Pagination<'a> {
170        pub language: Cow<'a, str>,
171        pub country: Cow<'a, str>,
172    }
173
174    impl std::str::FromStr for Pagination<'static> {
175        type Err = String;
176
177        fn from_str(s: &str) -> Result<Self, Self::Err> {
178            static LANGUAGE_AND_COUNTRY_RE: LazyLock<Regex> =
179                LazyLock::new(|| Regex::new(r"hl=([a-z]{2}).*gl=([a-z]{2})").unwrap());
180
181            LANGUAGE_AND_COUNTRY_RE
182                .captures(s)
183                .and_then(|captures| captures.get(1).zip(captures.get(2)))
184                .map(|(language, country)| Self {
185                    language: language.as_str().to_string().into(),
186                    country: country.as_str().to_string().into(),
187                })
188                .ok_or_else(|| s.to_string())
189        }
190    }
191
192    #[derive(Clone, Debug, Eq, PartialEq)]
193    struct Review {
194        pub app_id: String,
195        pub sort_order: u8,
196        pub number: usize,
197        pub token: Option<String>,
198    }
199
200    impl std::str::FromStr for Review {
201        type Err = String;
202
203        fn from_str(s: &str) -> Result<Self, Self::Err> {
204            static REVIEW_RE: LazyLock<Regex> = LazyLock::new(|| {
205                Regex::new(r#"^f\.req=\[\[\["UsvDTd","\[null,null,\[2,(\d+),\[(\d+),null,([^\]]+)\],null,\[\]\],\[\\"([^\]]+)\\",7\]\]",null,"generic"\]\]\]$"#).unwrap()
206            });
207
208            let decoded = urlencoding::decode(s).map_err(|_| s.to_string())?;
209
210            REVIEW_RE
211                .captures(&decoded)
212                .and_then(|captures| {
213                    captures
214                        .get(1)
215                        .zip(captures.get(2))
216                        .zip(captures.get(3))
217                        .zip(captures.get(4))
218                        .and_then(
219                            |(((sort_order_match, number_match), token_match), app_id_match)| {
220                                sort_order_match
221                                    .as_str()
222                                    .parse::<u8>()
223                                    .ok()
224                                    .zip(number_match.as_str().parse::<usize>().ok())
225                                    .zip(match token_match.as_str() {
226                                        "null" => Some(None),
227                                        other
228                                            if other.starts_with(r#"\""#)
229                                                && other.ends_with(r#"\""#) =>
230                                        {
231                                            Some(Some(other[2..other.len() - 2].to_string()))
232                                        }
233                                        _ => None,
234                                    })
235                                    .map(|((sort_order, number), token)| Self {
236                                        app_id: app_id_match.as_str().to_string(),
237                                        sort_order,
238                                        number,
239                                        token,
240                                    })
241                            },
242                        )
243                })
244                .ok_or_else(|| s.to_string())
245        }
246    }
247}