1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
use blake2::{Blake2s256, Digest};
use flate2::read::GzDecoder;
use serde::{Deserialize, Serialize};
use std::collections::{HashMap, HashSet};
use std::fs::File;
use std::io::{BufRead, BufReader};
use std::path::Path;

pub mod html;

/// The outcome of parsing a single page or document: the extracted text, a
/// hash of that text, and the metadata gathered alongside it.
///
/// Instances are usually assembled through [`ParseResult::builder`].
#[derive(Debug, Default, Clone, Serialize, Deserialize)]
pub struct ParseResult {
    /// Index should use this URL instead of the one that led to the content.
    pub canonical_url: Option<String>,
    /// Text content from page after stripping HTML tags & any semantically
    /// unimportant sections (header/footer/etc.)
    pub content: String,
    /// Used to determine whether document content has changed.
    /// When set through the builder's `content` method this is the
    /// hex-encoded Blake2s-256 digest of `content`.
    pub content_hash: String,
    /// Page description, extracted from meta tags or summarized from the actual content
    pub description: String,
    /// Links found in the page.
    /// Skipped by serde, so this set is not written out on serialization and
    /// comes back as the default (empty) set on deserialization.
    #[serde(skip)]
    pub links: HashSet<String>,
    /// Meta (OpenGraph, etc) tags associated w/ this content.
    pub meta: HashMap<String, String>,
    /// Title of the page, document, etc.
    pub title: Option<String>,
}

impl ParseResult {
    /// Returns a fresh [`ParseResultBuilder`] for assembling a `ParseResult`
    /// field by field.
    pub fn builder() -> ParseResultBuilder {
        ParseResultBuilder::new()
    }

    /// Opens the gzipped archive at `file` and returns an iterator over the
    /// `ParseResult`s serialized inside it, one per line.
    ///
    /// # Errors
    ///
    /// Returns an error if the file cannot be opened. Problems with the
    /// gzip stream or the serialized records are not detected here; they
    /// only surface (as early termination) while iterating.
    pub fn iter_from_gz(file: &Path) -> anyhow::Result<ParseResultGzIterator> {
        let handle = File::open(file)?;
        let decoder = GzDecoder::new(handle);
        let reader = BufReader::new(decoder);
        Ok(ParseResultGzIterator::new(reader))
    }
}

/// Buffered reader over a gzip-decoded file.
type GzBufReader = BufReader<GzDecoder<File>>;
/// Iterator that yields [`ParseResult`]s read line by line from a gzipped
/// archive.
pub struct ParseResultGzIterator {
    /// Source of decompressed lines.
    reader: GzBufReader,
    /// Scratch buffer holding the current line; cleared and refilled on every
    /// call to `next` so the allocation is reused across lines.
    buffer: String,
}

/// Constructor for the utility iterator that reads lines from a gzipped
/// archive of serialized ParseResults.
impl ParseResultGzIterator {
    /// Wraps `reader`, starting out with an empty line buffer.
    pub fn new(reader: GzBufReader) -> Self {
        let buffer = String::new();
        Self { reader, buffer }
    }
}

impl Iterator for ParseResultGzIterator {
    type Item = ParseResult;

    /// Reads the next line from the archive and deserializes it from RON.
    ///
    /// Returns `None` when the reader reports end of input (zero bytes
    /// read), when reading the line fails, or when the line does not
    /// deserialize into a [`ParseResult`].
    ///
    /// NOTE(review): a read or parse failure is indistinguishable from a
    /// clean end of input, so a single bad line makes a typical `for` loop
    /// stop early — confirm callers are fine with that.
    fn next(&mut self) -> Option<Self::Item> {
        // Reuse the same allocation for every line.
        self.buffer.clear();

        let bytes_read = self.reader.read_line(&mut self.buffer).ok()?;
        if bytes_read == 0 {
            return None;
        }

        ron::de::from_str::<ParseResult>(&self.buffer).ok()
    }
}

impl Default for ParseResultBuilder {
    fn default() -> Self {
        Self::new()
    }
}

/// Builder for [`ParseResult`]: each setter consumes the builder, stores one
/// field, and hands the builder back so calls can be chained.
pub struct ParseResultBuilder {
    /// The value under construction; returned as-is by `build`.
    result: ParseResult,
}

impl ParseResultBuilder {
    /// Consumes the builder and returns the assembled [`ParseResult`].
    pub fn build(self) -> ParseResult {
        self.result
    }

    /// Creates a builder whose result starts out as `ParseResult::default()`.
    pub fn new() -> Self {
        ParseResultBuilder {
            result: ParseResult::default(),
        }
    }

    /// Sets the canonical URL that should be used in place of the URL the
    /// content was fetched from.
    pub fn canonical_url(mut self, url: Option<String>) -> Self {
        self.result.canonical_url = url;
        self
    }

    /// Sets the text content and derives `content_hash` from it.
    ///
    /// The hash is the hex-encoded Blake2s-256 digest of the content bytes,
    /// so both fields always stay in sync when set through this method.
    pub fn content(mut self, content: String) -> Self {
        // Hash the borrowed bytes: cloning the whole document just to feed
        // the hasher would double the allocation for no benefit.
        let mut hasher = Blake2s256::new();
        hasher.update(content.as_bytes());
        let digest = hasher.finalize();

        self.result.content_hash = hex::encode(digest);
        self.result.content = content;

        self
    }

    /// Sets the page description.
    pub fn description(mut self, desc: String) -> Self {
        self.result.description = desc;
        self
    }

    /// Sets the links found in the page, replacing any previously set.
    pub fn links(mut self, links: HashSet<String>) -> Self {
        self.result.links = links;
        self
    }

    /// Sets the meta tags associated with the content, replacing any
    /// previously set.
    pub fn meta(mut self, meta: HashMap<String, String>) -> Self {
        self.result.meta = meta;
        self
    }

    /// Sets the title of the page or document.
    pub fn title(mut self, title: Option<String>) -> Self {
        self.result.title = title;
        self
    }
}