libnetrunner/parser/
mod.rs1use blake2::{Blake2s256, Digest};
2use flate2::read::GzDecoder;
3use serde::{Deserialize, Serialize};
4use std::collections::{HashMap, HashSet};
5use std::fs::File;
6use std::io::{BufRead, BufReader};
7use std::path::Path;
8
9pub mod html;
10
11#[derive(Debug, Default, Clone, Serialize, Deserialize)]
12pub struct ParseResult {
13 pub canonical_url: Option<String>,
15 pub content: String,
18 pub content_hash: String,
20 pub description: String,
22 #[serde(skip)]
24 pub links: HashSet<String>,
25 pub meta: HashMap<String, String>,
27 pub title: Option<String>,
29}
30
31impl ParseResult {
32 pub fn builder() -> ParseResultBuilder {
33 ParseResultBuilder::new()
34 }
35
36 pub fn iter_from_gz(file: &Path) -> anyhow::Result<ParseResultGzIterator> {
37 let file_name = file
38 .file_name()
39 .map(|f| f.to_string_lossy())
40 .unwrap_or_default();
41 let file_format = if file_name.contains(".jsonl.gz") {
42 ParseResultFormat::Json
43 } else {
44 ParseResultFormat::Ron
45 };
46
47 let file = File::open(file)?;
48 Ok(ParseResultGzIterator::new(
49 file_format,
50 BufReader::new(GzDecoder::new(file)),
51 ))
52 }
53}
54
/// On-disk encoding of the records stored in a gzip archive.
///
/// The enum is a plain tag with no payload, so it derives the common
/// value-type traits: it can be copied freely and compared with `==`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum ParseResultFormat {
    /// Each line is decoded with `serde_json`.
    Json,
    /// Each line is decoded with `ron`.
    Ron,
}
60
61type GzBufReader = BufReader<GzDecoder<File>>;
62pub struct ParseResultGzIterator {
63 file_format: ParseResultFormat,
64 reader: GzBufReader,
65 buffer: String,
66}
67
68impl ParseResultGzIterator {
71 pub fn new(file_format: ParseResultFormat, reader: GzBufReader) -> Self {
72 Self {
73 file_format,
74 reader,
75 buffer: String::new(),
76 }
77 }
78}
79
80impl Iterator for ParseResultGzIterator {
81 type Item = ParseResult;
82 fn next(&mut self) -> Option<Self::Item> {
83 self.buffer.clear();
84 if let Ok(read) = self.reader.read_line(&mut self.buffer) {
85 if read == 0 {
86 return None;
87 }
88
89 match self.file_format {
90 ParseResultFormat::Json => {
91 if let Ok(res) = serde_json::de::from_str::<ParseResult>(&self.buffer) {
92 return Some(res);
93 }
94 }
95 ParseResultFormat::Ron => {
96 if let Ok(res) = ron::de::from_str::<ParseResult>(&self.buffer) {
97 return Some(res);
98 }
99 }
100 }
101 }
102
103 None
104 }
105}
106
107impl Default for ParseResultBuilder {
108 fn default() -> Self {
109 Self::new()
110 }
111}
112
113pub struct ParseResultBuilder {
114 result: ParseResult,
115}
116
117impl ParseResultBuilder {
118 pub fn build(self) -> ParseResult {
119 self.result
120 }
121
122 pub fn new() -> Self {
123 ParseResultBuilder {
124 result: ParseResult::default(),
125 }
126 }
127
128 pub fn canonical_url(mut self, url: Option<String>) -> Self {
129 self.result.canonical_url = url;
130 self
131 }
132
133 pub fn content(mut self, content: String) -> Self {
134 let mut hasher = Blake2s256::new();
135 hasher.update(content.clone());
136 let res = hasher.finalize();
137
138 self.result.content = content;
139 self.result.content_hash = hex::encode(res);
140
141 self
142 }
143
144 pub fn description(mut self, desc: String) -> Self {
145 self.result.description = desc;
146 self
147 }
148
149 pub fn links(mut self, links: HashSet<String>) -> Self {
150 self.result.links = links;
151 self
152 }
153
154 pub fn meta(mut self, meta: HashMap<String, String>) -> Self {
155 self.result.meta = meta;
156 self
157 }
158
159 pub fn title(mut self, title: Option<String>) -> Self {
160 self.result.title = title;
161 self
162 }
163}
164
#[cfg(test)]
mod test {
    use super::ParseResult;
    use std::path::Path;

    /// Opens the archive at `archive` and returns how many records the
    /// iterator yields. Panics if the archive cannot be opened.
    fn count_records(archive: &str) -> usize {
        ParseResult::iter_from_gz(Path::new(archive))
            .unwrap()
            .count()
    }

    /// The RON fixture holds exactly one record.
    #[test]
    pub fn test_ron_archive() {
        assert_eq!(count_records("fixtures/archives/ron.gz"), 1);
    }

    /// The JSON-lines fixture holds exactly one record.
    #[test]
    pub fn test_json_archive() {
        assert_eq!(count_records("fixtures/archives/json.jsonl.gz"), 1);
    }
}