libnetrunner/parser/
mod.rs

1use blake2::{Blake2s256, Digest};
2use flate2::read::GzDecoder;
3use serde::{Deserialize, Serialize};
4use std::collections::{HashMap, HashSet};
5use std::fs::File;
6use std::io::{BufRead, BufReader};
7use std::path::Path;
8
9pub mod html;
10
11#[derive(Debug, Default, Clone, Serialize, Deserialize)]
12pub struct ParseResult {
13    /// Index should use this URL instead of the one that lead to the content.
14    pub canonical_url: Option<String>,
15    /// Text content from page after stripping HTML tags & any semantically
16    /// unimportant sections (header/footer/etc.)
17    pub content: String,
18    /// Used to determine whether document content has changed.
19    pub content_hash: String,
20    /// Page description, extracted from meta tags or summarized from the actual content
21    pub description: String,
22    /// Links found in the page.
23    #[serde(skip)]
24    pub links: HashSet<String>,
25    /// Meta (OpenGraph, etc) tags associated w/ this content.
26    pub meta: HashMap<String, String>,
27    /// Title of the page, document, etc.
28    pub title: Option<String>,
29}
30
31impl ParseResult {
32    pub fn builder() -> ParseResultBuilder {
33        ParseResultBuilder::new()
34    }
35
36    pub fn iter_from_gz(file: &Path) -> anyhow::Result<ParseResultGzIterator> {
37        let file_name = file
38            .file_name()
39            .map(|f| f.to_string_lossy())
40            .unwrap_or_default();
41        let file_format = if file_name.contains(".jsonl.gz") {
42            ParseResultFormat::Json
43        } else {
44            ParseResultFormat::Ron
45        };
46
47        let file = File::open(file)?;
48        Ok(ParseResultGzIterator::new(
49            file_format,
50            BufReader::new(GzDecoder::new(file)),
51        ))
52    }
53}
54
/// Serialization format of the lines stored in a `ParseResult` archive.
#[derive(Debug)]
pub enum ParseResultFormat {
    /// Each line is a JSON document.
    Json,
    /// Each line is a RON document.
    Ron,
}
60
61type GzBufReader = BufReader<GzDecoder<File>>;
62pub struct ParseResultGzIterator {
63    file_format: ParseResultFormat,
64    reader: GzBufReader,
65    buffer: String,
66}
67
68/// Utility iterator that reads in lines from a gzipped archive of serialized
69/// ParseResults
70impl ParseResultGzIterator {
71    pub fn new(file_format: ParseResultFormat, reader: GzBufReader) -> Self {
72        Self {
73            file_format,
74            reader,
75            buffer: String::new(),
76        }
77    }
78}
79
80impl Iterator for ParseResultGzIterator {
81    type Item = ParseResult;
82    fn next(&mut self) -> Option<Self::Item> {
83        self.buffer.clear();
84        if let Ok(read) = self.reader.read_line(&mut self.buffer) {
85            if read == 0 {
86                return None;
87            }
88
89            match self.file_format {
90                ParseResultFormat::Json => {
91                    if let Ok(res) = serde_json::de::from_str::<ParseResult>(&self.buffer) {
92                        return Some(res);
93                    }
94                }
95                ParseResultFormat::Ron => {
96                    if let Ok(res) = ron::de::from_str::<ParseResult>(&self.buffer) {
97                        return Some(res);
98                    }
99                }
100            }
101        }
102
103        None
104    }
105}
106
107impl Default for ParseResultBuilder {
108    fn default() -> Self {
109        Self::new()
110    }
111}
112
113pub struct ParseResultBuilder {
114    result: ParseResult,
115}
116
117impl ParseResultBuilder {
118    pub fn build(self) -> ParseResult {
119        self.result
120    }
121
122    pub fn new() -> Self {
123        ParseResultBuilder {
124            result: ParseResult::default(),
125        }
126    }
127
128    pub fn canonical_url(mut self, url: Option<String>) -> Self {
129        self.result.canonical_url = url;
130        self
131    }
132
133    pub fn content(mut self, content: String) -> Self {
134        let mut hasher = Blake2s256::new();
135        hasher.update(content.clone());
136        let res = hasher.finalize();
137
138        self.result.content = content;
139        self.result.content_hash = hex::encode(res);
140
141        self
142    }
143
144    pub fn description(mut self, desc: String) -> Self {
145        self.result.description = desc;
146        self
147    }
148
149    pub fn links(mut self, links: HashSet<String>) -> Self {
150        self.result.links = links;
151        self
152    }
153
154    pub fn meta(mut self, meta: HashMap<String, String>) -> Self {
155        self.result.meta = meta;
156        self
157    }
158
159    pub fn title(mut self, title: Option<String>) -> Self {
160        self.result.title = title;
161        self
162    }
163}
164
#[cfg(test)]
mod test {
    use super::ParseResult;
    use std::path::Path;

    /// Opens the gzipped archive at `archive` and counts the records the
    /// iterator yields. Panics if the archive cannot be opened.
    fn record_count(archive: &str) -> usize {
        ParseResult::iter_from_gz(Path::new(archive))
            .unwrap()
            .count()
    }

    /// The RON fixture archive yields exactly one record.
    #[test]
    pub fn test_ron_archive() {
        assert_eq!(record_count("fixtures/archives/ron.gz"), 1);
    }

    /// The JSON-lines fixture archive yields exactly one record.
    #[test]
    pub fn test_json_archive() {
        assert_eq!(record_count("fixtures/archives/json.jsonl.gz"), 1);
    }
}
185}