1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
use blake2::{Blake2s256, Digest};
use flate2::read::GzDecoder;
use serde::{Deserialize, Serialize};
use std::collections::{HashMap, HashSet};
use std::fs::File;
use std::io::{BufRead, BufReader};
use std::path::Path;
pub mod html;
/// Data extracted from one parsed document.
///
/// Values are normally assembled through `ParseResultBuilder` (see
/// `ParseResult::builder`). The type derives serde's `Serialize` and
/// `Deserialize`; `ParseResultGzIterator` reads instances back from
/// RON-encoded lines.
#[derive(Debug, Default, Clone, Serialize, Deserialize)]
pub struct ParseResult {
/// Canonical URL, when one is known.
pub canonical_url: Option<String>,
/// Content text. `ParseResultBuilder::content` sets this together with
/// `content_hash`.
pub content: String,
/// Hex-encoded Blake2s256 digest of `content`, as produced by
/// `ParseResultBuilder::content`. Empty when the content was never set
/// through the builder.
pub content_hash: String,
/// Description text; empty by default.
pub description: String,
/// Set of link strings. Marked `#[serde(skip)]`, so it is left out of
/// serialized output and is filled with its default (an empty set) when a
/// value is deserialized.
#[serde(skip)]
pub links: HashSet<String>,
/// Key/value metadata.
pub meta: HashMap<String, String>,
/// Title, when one is known.
pub title: Option<String>,
}
impl ParseResult {
    /// Starts a new [`ParseResultBuilder`] with every field at its default.
    pub fn builder() -> ParseResultBuilder {
        ParseResultBuilder::new()
    }

    /// Opens the gzip-compressed file at `file` and returns an iterator that
    /// yields the [`ParseResult`] values stored in it, one per line.
    ///
    /// # Errors
    ///
    /// Returns an error if the file cannot be opened. Nothing is read or
    /// decompressed here; that happens lazily as the iterator is advanced.
    pub fn iter_from_gz(file: &Path) -> anyhow::Result<ParseResultGzIterator> {
        let handle = File::open(file)?;
        // Decompress first, then buffer, so line reads work on the plain text.
        let decoder = GzDecoder::new(handle);
        let reader = BufReader::new(decoder);
        Ok(ParseResultGzIterator::new(reader))
    }
}
/// Buffered reader over a gzip-decoded file; the input type of
/// `ParseResultGzIterator`.
type GzBufReader = BufReader<GzDecoder<File>>;
/// Iterator that reads `ParseResult` values line by line from a
/// gzip-compressed file.
pub struct ParseResultGzIterator {
/// Source of decompressed, buffered text.
reader: GzBufReader,
/// Line buffer reused across calls to `next`; cleared before every read.
buffer: String,
}
impl ParseResultGzIterator {
    /// Wraps `reader` in an iterator, starting with an empty line buffer.
    pub fn new(reader: GzBufReader) -> Self {
        let buffer = String::new();
        Self { reader, buffer }
    }
}
impl Iterator for ParseResultGzIterator {
    type Item = ParseResult;

    /// Reads the next line and decodes it as a RON-encoded [`ParseResult`].
    ///
    /// Returns `None` at end of input, and also when a line cannot be read or
    /// cannot be parsed. Such failures are not reported to the caller; they
    /// look the same as end of input.
    fn next(&mut self) -> Option<Self::Item> {
        // `read_line` appends, so drop the previous line first.
        self.buffer.clear();

        // An I/O (or decompression) error is treated like end of input.
        let bytes_read = self.reader.read_line(&mut self.buffer).ok()?;
        if bytes_read == 0 {
            return None;
        }

        // A line that fails to parse also yields `None`.
        ron::de::from_str::<ParseResult>(&self.buffer).ok()
    }
}
impl Default for ParseResultBuilder {
fn default() -> Self {
Self::new()
}
}
/// Builder for `ParseResult`.
///
/// Each setter takes the builder by value and returns it, so calls can be
/// chained; `build` hands back the finished `ParseResult`.
pub struct ParseResultBuilder {
/// The value under construction; starts as `ParseResult::default()`.
result: ParseResult,
}
impl ParseResultBuilder {
    /// Consumes the builder and returns the assembled [`ParseResult`].
    pub fn build(self) -> ParseResult {
        self.result
    }

    /// Creates a builder whose result starts as `ParseResult::default()`.
    pub fn new() -> Self {
        ParseResultBuilder {
            result: ParseResult::default(),
        }
    }

    /// Sets the canonical URL (`None` clears it).
    pub fn canonical_url(mut self, url: Option<String>) -> Self {
        self.result.canonical_url = url;
        self
    }

    /// Sets the content and stores the hex-encoded Blake2s256 digest of its
    /// bytes in `content_hash`.
    pub fn content(mut self, content: String) -> Self {
        let mut hasher = Blake2s256::new();
        // Hash the borrowed bytes; cloning the whole string just to feed the
        // hasher would allocate and copy the document for no benefit.
        hasher.update(content.as_bytes());
        let digest = hasher.finalize();
        self.result.content_hash = hex::encode(digest);
        self.result.content = content;
        self
    }

    /// Sets the description text.
    pub fn description(mut self, desc: String) -> Self {
        self.result.description = desc;
        self
    }

    /// Replaces the set of links.
    pub fn links(mut self, links: HashSet<String>) -> Self {
        self.result.links = links;
        self
    }

    /// Replaces the metadata map.
    pub fn meta(mut self, meta: HashMap<String, String>) -> Self {
        self.result.meta = meta;
        self
    }

    /// Sets the title (`None` clears it).
    pub fn title(mut self, title: Option<String>) -> Self {
        self.result.title = title;
        self
    }
}