ieql/input/document.rs
//! This module provides functionality related to document handling.
2
3use common::compilation::CompilableTo;
4use common::validation::Issue;
5use query::scope::ScopeContent;
6use regex::Regex;
7use url::Url;
8use lazy_static::lazy_static;
9use htmlescape::decode_html;
10
11lazy_static! {
12 static ref HTML_REGEX: Regex = Regex::new(r"<(.*?)>").unwrap();
13 static ref SPACE_REGEX: Regex = Regex::new(r"\s{2,}").unwrap();
14}
15
/// A `Document` is the raw, unprocessed form of any kind of document—most
/// commonly one retrieved from the Internet. Because it holds the complete
/// contents of the document, a `Document` can be quite large.
///
/// In practice this struct is mostly an intermediate format: data passes
/// through it on the way to becoming a `CompiledDocument`.
#[derive(Clone)]
pub struct Document {
    /// The URL of the document, if one is known.
    ///
    /// Internet documents typically use the form `Some("https://...")`,
    /// while local documents typically use the form
    /// `Some("/path/to/file")`.
    pub url: Option<String>,
    /// The contents of the document.
    ///
    /// The contents are kept as a `Vec<u8>`, primarily for first-class
    /// support of text documents (`utf8`).
    pub data: Vec<u8>,
    /// A valid IETF `mime` type (as per RFC 2045), if one is known.
    pub mime: Option<String>,
}
38
39/// A `DocumentReference` is a reference to a document that is either
40/// already loaded into memory or exists at some path. This path can,
41/// in theory, be a URL or a relative (or absolute) path on the user's
42/// local filesystem.
43///
44/// Currently, only local paths are supported. URLs will be supported
45/// in a future version of IEQL.
46///
47/// The benefit of `DocumentReference` lies primarily in multithreading.
48/// Using `DocumentReference`s allows for file IO to be parallelized.
49/// (By passing a `DocumentReference` or `DocumentReferenceBatch` to
50/// a concurrent scanner, one need not actually read the document from
51/// the disk in the main thread.)
52pub enum DocumentReference {
53 /// Represents a document that is already present in memory and
54 /// does not need to be loaded from the disk.
55 Populated(Document),
56 /// Represents a document that _has not already been loaded_. The
57 /// contained `String` is the document's path.
58 Unpopulated(String),
59}
60
61/// Represents a batch (collection in the form of a `Vec`) of
62/// `DocumentReference`s.
63///
64/// This struct is particularly useful for scanning, as it allows
65/// one function call to take many different document references.
66/// It also enables 'processing groups'—i.e. groups of documents that
67/// will always be processed together in the same thread.
68pub struct DocumentReferenceBatch {
69 /// Contains the DocumentReferences
70 pub documents: Vec<DocumentReference>,
71}
72
/// A `CompiledDocument` is a `Document` that has been processed and is
/// ready to be scanned. While compiling, the IEQL document compiler
/// derives the following information from the `Document`:
///
/// * **text** — the text of the document. Currently, only HTML parsing is supported.
/// * **domain** — the domain name, if present, is also processed.
/// * **raw** — unlike a `Document`, whose contents are bytes, a `CompiledDocument` holds text.
///
/// When the document is not HTML, `text` is identical to `raw`.
pub struct CompiledDocument {
    /// The URL of the source document, if it had one.
    pub url: Option<String>,
    /// The complete contents of the source document as text.
    pub raw: String,
    /// The `mime` type of the source document, if it had one.
    pub mime: Option<String>,
    /// The text extracted from the document; for HTML documents the
    /// markup has been removed.
    pub text: String,
    /// The host name taken from `url`, if one could be determined.
    pub domain: Option<String>,
}
89
90/// Represents a batch (collection in the form of a `Vec`) of `Document`s.
91pub struct DocumentBatch {
92 /// Contains the documents
93 pub documents: Vec<Document>,
94}
95
96/// Represents a batch (collection in the form of a `Vec`) of `CompiledDocument`s.
97pub struct CompiledDocumentBatch {
98 /// Contains the compiled documents
99 pub documents: Vec<CompiledDocument>,
100}
101
/// The kinds of documents for which intelligent text extraction is
/// supported.
enum DocumentKind {
    /// An HTML document.
    Html,
    /// Any document whose kind was not recognized.
    Unknown,
}
108
109impl Document {
110 /// This function detects the document's `DocumentKind` by looking at its path
111 /// and MIME information.
112 fn detect_document_kind(&self) -> DocumentKind {
113 // Detect HTML
114 let mut is_html = match &self.mime {
115 Some(value) => value.eq("text/html"),
116 None => false,
117 };
118 match &self.url {
119 Some(value) => {
120 if value.ends_with(".html") {
121 is_html = true;
122 }
123 }
124 None => (),
125 };
126 if is_html {
127 return DocumentKind::Html;
128 }
129
130 DocumentKind::Unknown
131 }
132
133 /// This function extracts the hostname (domain name) of a document. In cases where
134 /// the host name isn't known, this function returns `None`.
135 pub fn domain(&self) -> Option<String> {
136 let own_url = match &self.url {
137 Some(value) => value,
138 None => return None,
139 };
140 let parsed_url = match Url::parse(own_url.as_str()) {
141 Ok(url) => url,
142 Err(_) => return None,
143 };
144 match parsed_url.host_str() {
145 Some(value) => Some(String::from(value)),
146 None => None,
147 }
148 }
149
150 /// This function extracts text from the document's `data`. It assumes `utf8` encoding.
151 /// Note that this function is very different from `extract_document_text()`: this function
152 /// simply extracts text, while `extract_document_text()` also, in some cases, parses it.
153 fn raw(&self) -> String {
154 String::from_utf8_lossy(self.data.as_slice()).into_owned()
155 }
156
157 /// This function intelligently extracts text from the document—which is to say that it is
158 /// able to parse HTML documents and extract the human-readable text. Additional document types,
159 /// such as PDFs, will be supported in the future.
160 fn extract_document_text(&self) -> String {
161 match &self.detect_document_kind() {
162 DocumentKind::Html => {
163 let extracted = String::from(SPACE_REGEX.replace_all(&HTML_REGEX.replace_all(&self.raw(), " "), " "));
164 match decode_html(extracted.as_str()) {
165 Ok(value) => value,
166 Err(_) => extracted
167 }
168 },
169 DocumentKind::Unknown => self.raw(),
170 }
171 }
172}
173
174impl CompilableTo<CompiledDocument> for Document {
175 fn compile(&self) -> Result<CompiledDocument, Issue> {
176 let text = self.extract_document_text();
177 let domain = self.domain();
178 let raw = self.raw();
179 Ok(CompiledDocument {
180 url: self.url.clone(),
181 raw: raw,
182 mime: self.mime.clone(),
183 text: text,
184 domain: domain,
185 })
186 }
187}
188
189impl CompilableTo<CompiledDocumentBatch> for DocumentBatch {
190 fn compile(&self) -> Result<CompiledDocumentBatch, Issue> {
191 let mut compiled_documents: Vec<CompiledDocument> = Vec::new();
192 for document in &self.documents {
193 let compiled_document = match document.compile() {
194 Ok(value) => value,
195 Err(_error) => continue, // silent failure
196 };
197 compiled_documents.push(compiled_document);
198 }
199 Ok(CompiledDocumentBatch {
200 documents: compiled_documents,
201 })
202 }
203}
204
205impl CompiledDocument {
206 /// This function returns the document content relative to the
207 /// given `ScopeContent`. For example, if the `ScopeContent`
208 /// is `Raw`, this function will return the document's `Raw` data.
209 /// If it is `Text`, this function will return the document's parsed
210 /// text.
211 pub fn content(&self, content: ScopeContent) -> &String {
212 match content {
213 ScopeContent::Raw => &self.raw,
214 ScopeContent::Text => &self.text,
215 }
216 }
217}
218
219impl From<Vec<Document>> for DocumentBatch {
220 fn from(docs: Vec<Document>) -> DocumentBatch {
221 DocumentBatch { documents: docs }
222 }
223}
224
225impl From<Vec<DocumentReference>> for DocumentReferenceBatch {
226 fn from(docs: Vec<DocumentReference>) -> DocumentReferenceBatch {
227 DocumentReferenceBatch { documents: docs }
228 }
229}