ieql/input/
document.rs

//! This module provides functionality related to document handling.
2
3use common::compilation::CompilableTo;
4use common::validation::Issue;
5use query::scope::ScopeContent;
6use regex::Regex;
7use url::Url;
8use lazy_static::lazy_static;
9use htmlescape::decode_html;
10
11lazy_static! {
12    static ref HTML_REGEX: Regex = Regex::new(r"<(.*?)>").unwrap();
13    static ref SPACE_REGEX: Regex = Regex::new(r"\s{2,}").unwrap();
14}
15
/// The `Document` struct represents any kind of document, but typically
/// some sort of Internet document. A `Document` can often be quite large;
/// after all, it contains the entire text of a document.
///
/// In practice, this struct functions more as an interim format as data becomes
/// a `CompiledDocument`.
///
/// `Debug` is derived so that documents can be logged and used in assertions.
/// Note that the debug output includes every byte of `data`.
#[derive(Clone, Debug)]
pub struct Document {
    /// `url` represents the URL of the document, if it is present.
    ///
    /// For internet documents, this typically takes the form of `Some("https://...")`,
    /// whereas for local documents this typically takes the form of
    /// `Some("/path/to/file")`.
    pub url: Option<String>,
    /// `data` contains the raw bytes of the document.
    ///
    /// This data is stored as a `Vec<u8>` primarily for first-class text
    /// document support (`utf8`).
    pub data: Vec<u8>,
    /// `mime` represents a valid IETF `mime` type, as per RFC 2045, if it is known.
    pub mime: Option<String>,
}
38
/// A `DocumentReference` is a reference to a document that is either
/// already loaded into memory or exists at some path. This path can,
/// in theory, be a URL or a relative (or absolute) path on the user's
/// local filesystem.
///
/// Currently, only local paths are supported. URLs will be supported
/// in a future version of IEQL.
///
/// The benefit of `DocumentReference` lies primarily in multithreading.
/// Using `DocumentReference`s allows for file IO to be parallelized.
/// (By passing a `DocumentReference` or `DocumentReferenceBatch` to
/// a concurrent scanner, one need not actually read the document from
/// the disk in the main thread.)
pub enum DocumentReference {
    /// Represents a document that is already present in memory and
    /// does not need to be loaded from the disk. The contained
    /// `Document` is owned by the reference.
    Populated(Document),
    /// Represents a document that _has not already been loaded_. The
    /// contained `String` is the document's path.
    Unpopulated(String),
}
60
/// Represents a batch (collection in the form of a `Vec`) of
/// `DocumentReference`s.
///
/// This struct is particularly useful for scanning, as it allows
/// one function call to take many different document references.
/// It also enables 'processing groups'—i.e. groups of documents that
/// will always be processed together in the same thread.
///
/// A batch can be built directly from a `Vec<DocumentReference>` via `From`.
pub struct DocumentReferenceBatch {
    /// The document references that make up this batch.
    pub documents: Vec<DocumentReference>,
}
72
/// A `CompiledDocument` is a `Document` that has been processed and
/// is ready to be scanned. During compilation, the IEQL document compiler
/// extracts the following information from the `Document`:
///
/// * **text** — the text of the document. Currently, only HTML parsing is supported.
/// * **domain** — the domain name, if present, is also processed.
/// * **raw** — unlike `Document`s, whose contents are bytes, `CompiledDocument`s have text.
///
/// In cases that the document is not HTML, `text` is identical to `raw`.
#[derive(Clone, Debug)]
pub struct CompiledDocument {
    /// The URL (or path) of the source document, copied from `Document::url`.
    pub url: Option<String>,
    /// The document's bytes decoded as UTF-8 text, without any markup processing.
    pub raw: String,
    /// The MIME type of the source document, copied from `Document::mime`.
    pub mime: Option<String>,
    /// The human-readable text extracted from the document. For HTML documents
    /// this is `raw` with the markup removed; otherwise it equals `raw`.
    pub text: String,
    /// The host name parsed from `url`, or `None` when there is no URL, the URL
    /// cannot be parsed, or it has no host.
    pub domain: Option<String>,
}
89
/// Represents a batch (collection in the form of a `Vec`) of `Document`s.
///
/// A batch can be built from a `Vec<Document>` via `From`, and compiled into a
/// `CompiledDocumentBatch` through its `CompilableTo` implementation.
pub struct DocumentBatch {
    /// The documents that make up this batch.
    pub documents: Vec<Document>,
}
95
/// Represents a batch (collection in the form of a `Vec`) of `CompiledDocument`s.
///
/// This is the result of compiling a `DocumentBatch`.
pub struct CompiledDocumentBatch {
    /// The compiled documents that make up this batch.
    pub documents: Vec<CompiledDocument>,
}
101
/// This enum represents the various kinds of documents which support intelligent
/// text extraction.
enum DocumentKind {
    /// An HTML document; its markup is removed during text extraction.
    Html,
    /// Any other document; its text is used as-is.
    Unknown,
}
108
109impl Document {
110    /// This function detects the document's `DocumentKind` by looking at its path
111    /// and MIME information.
112    fn detect_document_kind(&self) -> DocumentKind {
113        // Detect HTML
114        let mut is_html = match &self.mime {
115            Some(value) => value.eq("text/html"),
116            None => false,
117        };
118        match &self.url {
119            Some(value) => {
120                if value.ends_with(".html") {
121                    is_html = true;
122                }
123            }
124            None => (),
125        };
126        if is_html {
127            return DocumentKind::Html;
128        }
129
130        DocumentKind::Unknown
131    }
132
133    /// This function extracts the hostname (domain name) of a document. In cases where
134    /// the host name isn't known, this function returns `None`.
135    pub fn domain(&self) -> Option<String> {
136        let own_url = match &self.url {
137            Some(value) => value,
138            None => return None,
139        };
140        let parsed_url = match Url::parse(own_url.as_str()) {
141            Ok(url) => url,
142            Err(_) => return None,
143        };
144        match parsed_url.host_str() {
145            Some(value) => Some(String::from(value)),
146            None => None,
147        }
148    }
149
150    /// This function extracts text from the document's `data`. It assumes `utf8` encoding.
151    /// Note that this function is very different from `extract_document_text()`: this function
152    /// simply extracts text, while `extract_document_text()` also, in some cases, parses it.
153    fn raw(&self) -> String {
154        String::from_utf8_lossy(self.data.as_slice()).into_owned()
155    }
156
157    /// This function intelligently extracts text from the document—which is to say that it is
158    /// able to parse HTML documents and extract the human-readable text. Additional document types,
159    /// such as PDFs, will be supported in the future.
160    fn extract_document_text(&self) -> String {
161        match &self.detect_document_kind() {
162            DocumentKind::Html => {
163                let extracted = String::from(SPACE_REGEX.replace_all(&HTML_REGEX.replace_all(&self.raw(), " "), " "));
164                match decode_html(extracted.as_str()) {
165                    Ok(value) => value,
166                    Err(_) => extracted
167                }
168            },
169            DocumentKind::Unknown => self.raw(),
170        }
171    }
172}
173
174impl CompilableTo<CompiledDocument> for Document {
175    fn compile(&self) -> Result<CompiledDocument, Issue> {
176        let text = self.extract_document_text();
177        let domain = self.domain();
178        let raw = self.raw();
179        Ok(CompiledDocument {
180            url: self.url.clone(),
181            raw: raw,
182            mime: self.mime.clone(),
183            text: text,
184            domain: domain,
185        })
186    }
187}
188
189impl CompilableTo<CompiledDocumentBatch> for DocumentBatch {
190    fn compile(&self) -> Result<CompiledDocumentBatch, Issue> {
191        let mut compiled_documents: Vec<CompiledDocument> = Vec::new();
192        for document in &self.documents {
193            let compiled_document = match document.compile() {
194                Ok(value) => value,
195                Err(_error) => continue, // silent failure
196            };
197            compiled_documents.push(compiled_document);
198        }
199        Ok(CompiledDocumentBatch {
200            documents: compiled_documents,
201        })
202    }
203}
204
205impl CompiledDocument {
206    /// This function returns the document content relative to the
207    /// given `ScopeContent`. For example, if the `ScopeContent`
208    /// is `Raw`, this function will return the document's `Raw` data.
209    /// If it is `Text`, this function will return the document's parsed
210    /// text.
211    pub fn content(&self, content: ScopeContent) -> &String {
212        match content {
213            ScopeContent::Raw => &self.raw,
214            ScopeContent::Text => &self.text,
215        }
216    }
217}
218
219impl From<Vec<Document>> for DocumentBatch {
220    fn from(docs: Vec<Document>) -> DocumentBatch {
221        DocumentBatch { documents: docs }
222    }
223}
224
225impl From<Vec<DocumentReference>> for DocumentReferenceBatch {
226    fn from(docs: Vec<DocumentReference>) -> DocumentReferenceBatch {
227        DocumentReferenceBatch { documents: docs }
228    }
229}