infisearch/loader/
html.rs

1use std::iter::FromIterator;
2use std::path::Path;
3use std::path::PathBuf;
4use std::sync::Arc;
5
6use log::error;
7use path_slash::PathExt;
8use rustc_hash::FxHashMap;
9use rustc_hash::FxHashSet;
10use scraper::ElementRef;
11use scraper::Html;
12use scraper::Selector;
13use serde::{Deserialize, Deserializer, Serialize, Serializer};
14
15use crate::field_info::RELATIVE_FP_FIELD;
16use crate::loader::Loader;
17use crate::loader::LoaderResult;
18use crate::loader::LoaderResultIterator;
19use crate::worker::miner::{DEFAULT_ZONE_SEPARATION, Zone};
20
// `separation` value given to a zone that starts a different field than the
// zone pushed immediately before it (see `HtmlLoaderResult::traverse_node`).
const HTML_ZONE_SEPARATION: u32 = 3;
// `separation` value given to a zone that continues the same field as the
// previous zone but was split off because a separator element was crossed.
// NOTE(review): how the miner interprets the numeric value is not visible here.
const SEPARATOR_EL_SEPARATION: u32 = 0;
23
/// A compiled selector rule, applied to every element visited while
/// traversing a parsed HTML document.
pub struct HtmlLoaderSelector {
    /// Parsed CSS selector that an element must match for this rule to apply.
    selector: Selector,
    /// Field under which text inside a matching element is indexed.
    /// `None` means the rule only contributes attribute values.
    field_name: Option<String>,
    /// Maps an attribute name to the field that receives that attribute's value.
    attr_map: FxHashMap<String, String>,
}
29
/// User-facing (serializable) form of a selector rule. The CSS selector
/// string itself is the key of the map that holds these values.
#[derive(Serialize, Deserialize)]
pub struct HtmlLoaderSelectorRaw {
    /// Ordering key: rules with a higher priority are tried first.
    /// Defaults to `0` when omitted.
    #[serde(default)]
    priority: u32,
    /// Field under which text inside a matching element is indexed, if any.
    field_name: Option<String>,
    /// Maps an attribute name to the field that receives that attribute's value.
    /// Defaults to an empty map when omitted.
    #[serde(default)]
    attr_map: FxHashMap<String, String>,
}
38
39fn get_default_html_loader_selectors() -> FxHashMap<String, Option<HtmlLoaderSelectorRaw>> {
40    FxHashMap::from_iter(vec![
41        (
42            "span[data-infisearch-link]".to_owned(),
43            Some(HtmlLoaderSelectorRaw {
44                priority: 0,
45                field_name: None,
46                attr_map: FxHashMap::from_iter(vec![
47                    ("data-infisearch-link".to_owned(), "link".to_owned())
48                ]),
49            })
50        ),
51        (
52            "title".to_owned(),
53            Some(HtmlLoaderSelectorRaw {
54                priority: 0,
55                field_name: Some("title".to_owned()),
56                attr_map: FxHashMap::default(),
57            })
58        ),
59        (
60            "h1".to_owned(),
61            Some(HtmlLoaderSelectorRaw {
62                priority: 0,
63                field_name: Some("h1".to_owned()),
64                attr_map: FxHashMap::default(),
65            })
66        ),
67        (
68            "body".to_owned(),
69            Some(HtmlLoaderSelectorRaw {
70                priority: 0,
71                field_name: Some("body".to_owned()),
72                attr_map: FxHashMap::default(),
73            })
74        ),
75        (
76            "meta[name=\"description\"],meta[name=\"keywords\"]".to_owned(),
77            Some(HtmlLoaderSelectorRaw {
78                priority: 0,
79                field_name: None,
80                attr_map: FxHashMap::from_iter(vec![
81                    ("content".to_owned(), "body".to_owned())
82                ]),
83            })
84        ),
85        (
86            "h2,h3,h4,h5,h6".to_owned(),
87            Some(HtmlLoaderSelectorRaw {
88                priority: 0,
89                field_name: Some("heading".to_owned()),
90                attr_map: FxHashMap::from_iter(vec![
91                    ("id".to_owned(), "headingLink".to_owned())
92                ]),
93            })
94        ),
95    ])
96}
97
/// Serde default for `merge_default_selectors`: the built-in selectors are
/// merged with the user-supplied ones unless the option is explicitly disabled.
fn get_default_merge_default_selectors() -> bool {
    true
}
101
/// Serde default for `exclude_selectors`: a single combined selector listing
/// the elements that are removed from the document before indexing.
fn get_default_exclude_selectors() -> Vec<String> {
    let default_excludes = String::from("script,style,form,nav,[data-infisearch-ignore]");
    vec![default_excludes]
}
105
/// User-facing (serializable) configuration of the HTML loader.
#[derive(Serialize, Deserialize)]
pub struct HtmlLoaderOptionsRaw {
    /// Selector rules keyed by CSS selector string. An entry whose value is
    /// `None` is dropped when the loader is built.
    /// Defaults to an empty map when omitted.
    #[serde(default)]
    selectors: FxHashMap<String, Option<HtmlLoaderSelectorRaw>>,
    /// Whether the built-in default selectors are used as a base that the
    /// entries in `selectors` extend or override. Defaults to `true`.
    #[serde(default = "get_default_merge_default_selectors")]
    merge_default_selectors: bool,
    /// CSS selectors whose matching elements are detached from the document
    /// before it is traversed.
    #[serde(default = "get_default_exclude_selectors")]
    exclude_selectors: Vec<String>,
}
115
/// Compiled form of `HtmlLoaderOptionsRaw`, with all CSS selectors parsed.
pub struct HtmlLoaderOptions {
    /// Selector rules in the order they are tried against each element.
    selectors: Vec<HtmlLoaderSelector>,
    /// Selectors whose matching elements are removed before traversal.
    exclude_selectors: Vec<Selector>,
}
120
/// Loader for `.html` files.
pub struct HtmlLoader {
    /// Options as configured (after merging in the defaults, when enabled);
    /// this is what gets written out when the loader is serialized.
    pub raw_options: HtmlLoaderOptionsRaw,
    /// Compiled options, shared with every result this loader produces.
    pub options: Arc<HtmlLoaderOptions>,
}
125
/// A single HTML file that has been read from disk and is waiting to be
/// converted into zones.
struct HtmlLoaderResult {
    /// Relative path of the file, using forward slashes.
    link: String,
    /// Raw contents of the file.
    text: String,
    /// Compiled loader options shared with the originating `HtmlLoader`.
    options: Arc<HtmlLoaderOptions>,
    /// Absolute path of the file, handed back together with the zones.
    absolute_path: PathBuf,
}
132
133impl HtmlLoader {
134    pub fn get_new_html_loader(config: serde_json::Value) -> Box<Self> {
135        let mut html_loader_options_raw: HtmlLoaderOptionsRaw =
136            serde_json::from_value(config).expect("HtmlLoader options did not match schema!");
137
138        // --------------------------------------------------------------
139        // Merge/update the default selectors
140        if html_loader_options_raw.merge_default_selectors {
141            let mut selectors = get_default_html_loader_selectors();
142            std::mem::swap(&mut selectors, &mut html_loader_options_raw.selectors);
143            html_loader_options_raw.selectors.extend(selectors);
144        }
145        // --------------------------------------------------------------
146        
147        let mut selectors: Vec<_> = html_loader_options_raw.selectors.iter()
148            .filter_map(|(selector, opt)| opt.as_ref().map(|opt| (selector, opt)))    
149            .collect();
150        selectors.sort_by_key(|(_selector, opt)| opt.priority);
151        selectors.reverse();
152
153        let options = Arc::new(HtmlLoaderOptions {
154            selectors: selectors
155                .into_iter()
156                .map(|(selector, opt)| HtmlLoaderSelector {
157                    selector: Selector::parse(selector).expect("Invalid selector!"),
158                    field_name: opt.field_name.clone(),
159                    attr_map: opt.attr_map.clone(),
160                })
161                .collect(),
162            exclude_selectors: html_loader_options_raw
163                .exclude_selectors
164                .iter()
165                .map(|selector| Selector::parse(selector).expect("Invalid exclude selector!"))
166                .collect(),
167        });
168
169        Box::new(HtmlLoader { raw_options: html_loader_options_raw, options })
170    }
171}
172
173impl Loader for HtmlLoader {
174    fn try_index_file<'a>(
175        &'a self,
176        absolute_path: &Path,
177        relative_path: &Path,
178    ) -> Option<LoaderResultIterator<'a>> {
179        if let Some(extension) = relative_path.extension() {
180            if extension == "html" {
181                let absolute_path_as_buf = PathBuf::from(absolute_path);
182
183                if let Some(relative_path) = relative_path.to_slash() {
184                    return Some(Box::new(std::iter::once(Box::new(HtmlLoaderResult {
185                        link: relative_path.into_owned(),
186                        text: std::fs::read_to_string(absolute_path).expect("Failed to read file!"),
187                        options: self.options.clone(),
188                        absolute_path: absolute_path_as_buf,
189                    }) as Box<dyn LoaderResult + Send>)));
190                } else {
191                    error!("Unable to index {} containing non-unicode characters", relative_path.to_slash_lossy());
192                }
193            }
194        }
195
196        None
197    }
198
199    fn get_name(&self) -> String {
200        "HtmlLoader".to_owned()
201    }
202}
203
204impl Serialize for HtmlLoader {
205    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
206    where
207        S: Serializer,
208    {
209        self.raw_options.serialize(serializer)
210    }
211}
212
213impl<'de> Deserialize<'de> for HtmlLoader {
214    fn deserialize<D>(_deserializer: D) -> Result<Self, D::Error>
215    where
216        D: Deserializer<'de>,
217    {
218        panic!("Called deserialize for HtmlLoader")
219    }
220}
221
222lazy_static! {
223    /*
224     Elements that should, in 99.9% of cases indicate a separation and likely directly contain text.
225     Forcibly add a field separation in this case to prevent non-language-separated
226     tokens from being joined together mistakenly.
227     (e.g. "<td>a</td><td>b</td>" wrongly tokenized as "ab")
228    */
229    static ref SEPARATED_ELEMENTS: FxHashSet<&'static str> = FxHashSet::from_iter([
230        "td", "th",
231        "li", "dt", "dd",
232        "h1", "h2", "h3", "h4", "h5", "h6",
233        "hr", "br",
234        "caption", "figcaption", "blockquote",
235        "footer", "header", "main", "aside", "section", "article", "nav",
236        "pre", "kbd", "p",
237        "summary",
238        "textarea",
239        "label", "button", "legend", "option"
240    ]);
241}
242
243impl HtmlLoaderResult {
244    fn traverse_node<'a>(
245        &'a self,
246        node: ElementRef,
247        // This controls whether to add a new Zone if the field_name is the same
248        do_separate: &mut bool,
249        field_texts: &mut Vec<Zone>,
250        mut field_name_opt: Option<&'a String>,
251    ) {
252        for html_loader_selector in self.options.selectors.iter() {
253            if html_loader_selector.selector.matches(&node) {
254                for (attr_name, attr_field_name) in html_loader_selector.attr_map.iter() {
255                    if let Some(attr) = node.value().attr(attr_name) {
256                        field_texts.push(Zone {
257                            field_name: attr_field_name.to_owned(),
258                            field_text: attr.to_owned(),
259                            separation: 2,
260                        });
261                    }
262                }
263
264                if let Some(selector_field_name) = &html_loader_selector.field_name {
265                    field_name_opt = Some(selector_field_name);
266                    break;
267                }
268            }
269        }
270
271        if let Some(field_name) = field_name_opt {
272            // If there is no field name,
273            // then nothing will be indexed and separator elements need not be handled
274
275            let is_separator_el = SEPARATED_ELEMENTS.contains(node.value().name());
276
277            // Tell the parent context to separate other text before this element too
278            *do_separate = *do_separate || is_separator_el;
279
280            for child in node.children() {
281                if let Some(el_child) = ElementRef::wrap(child) {
282                    self.traverse_node(el_child, do_separate, field_texts, field_name_opt);
283                } else if let Some(text) = child.value().as_text() {
284                    let last = unsafe { field_texts.last_mut().unwrap_unchecked() };
285                    if last.field_name.as_str() == field_name.as_str() {
286                        if *do_separate {
287                            field_texts.push(Zone {
288                                field_name: field_name.to_owned(),
289                                field_text: text.to_string(),
290                                separation: SEPARATOR_EL_SEPARATION,
291                            });
292                            *do_separate = false;
293                        } else {
294                            last.field_text += text;
295                        }
296                    } else {
297                        field_texts.push(Zone {
298                            field_name: field_name.to_owned(),
299                            field_text: text.to_string(),
300                            separation: HTML_ZONE_SEPARATION,
301                        });
302                        *do_separate = false;
303                    }
304                }
305            }
306
307            // Tell the parent context to separate other text after this element
308            *do_separate = *do_separate || is_separator_el;
309        } else {
310            for child in node.children() {
311                if let Some(el_child) = ElementRef::wrap(child) {
312                    self.traverse_node(el_child, do_separate, field_texts, field_name_opt);
313                }
314            }
315        }
316    }
317}
318
319impl LoaderResult for HtmlLoaderResult {
320    fn get_field_texts_and_path(mut self: Box<Self>) -> (Vec<Zone>, PathBuf) {
321        let mut field_texts: Vec<Zone> = Vec::with_capacity(20);
322        let mut document = Html::parse_document(&self.text);
323        let mut do_separate = false;
324
325        field_texts.push(Zone {
326            field_name: RELATIVE_FP_FIELD.to_owned(),
327            field_text: std::mem::take(&mut self.link),
328            separation: DEFAULT_ZONE_SEPARATION,
329        });
330
331        for selector in self.options.exclude_selectors.iter() {
332            let ids: Vec<_> = document.select(selector).map(|selected| selected.id()).collect();
333            for id in ids {
334                document.tree.get_mut(id).unwrap().detach();
335            }
336        }
337
338        for child in document.tree.root().children() {
339            if let Some(el_child) = ElementRef::wrap(child) {
340                self.traverse_node(el_child, &mut do_separate, &mut field_texts, None);
341            }
342        }
343
344        (field_texts, self.absolute_path)
345    }
346}
347
#[cfg(test)]
mod test {
    use std::{path::PathBuf, sync::Arc};

    use scraper::Selector;

    use crate::{
        field_info::RELATIVE_FP_FIELD,
        loader::LoaderResult,
        worker::miner::{Zone, DEFAULT_ZONE_SEPARATION},
    };

    use super::{
        get_default_exclude_selectors, get_default_html_loader_selectors, HtmlLoaderOptions,
        HtmlLoaderResult, HtmlLoaderSelector, HTML_ZONE_SEPARATION, SEPARATOR_EL_SEPARATION,
    };

    /// Compiles the default selectors and default exclude selectors into
    /// loader options, skipping any disabled (`None`) selector entries.
    fn get_test_loader_options() -> HtmlLoaderOptions {
        let mut selectors = Vec::new();
        for (css, raw_selector) in get_default_html_loader_selectors() {
            if let Some(raw_selector) = raw_selector {
                selectors.push(HtmlLoaderSelector {
                    selector: Selector::parse(&css).expect("Invalid selector!"),
                    field_name: raw_selector.field_name,
                    attr_map: raw_selector.attr_map,
                });
            }
        }

        let mut exclude_selectors = Vec::new();
        for css in get_default_exclude_selectors().iter() {
            exclude_selectors.push(Selector::parse(css).expect("Invalid exclude selector!"));
        }

        HtmlLoaderOptions { selectors, exclude_selectors }
    }

    /// Text in separator elements (table cells, buttons, ...) must end up in
    /// separate zones, while inline elements such as `<span>` must not split
    /// the surrounding text.
    #[test]
    fn test_separation() {
        let html = concat!(
            "text before",
            "<table>",
            "<thead>",
            "<tr><th>o<button>n</button>e</th><th>two</th></tr>",
            "</thead><tbody>",
            "<tr><th>three</td><td>four</td></tr>",
            "<tr><th>five</td><td>si<button>x</button></td></tr>",
            "</tbody>",
            "</table>",
            "text after",
            "<h2><span>test</span> text</h2>",
        );

        let loader_result = Box::new(HtmlLoaderResult {
            link: String::new(),
            text: html.to_owned(),
            options: Arc::new(get_test_loader_options()),
            absolute_path: PathBuf::new(),
        });

        let zone = |field_name: &str, field_text: &str, separation: u32| Zone {
            field_name: field_name.to_owned(),
            field_text: field_text.to_owned(),
            separation,
        };

        let mut expected = vec![
            zone(RELATIVE_FP_FIELD, "", DEFAULT_ZONE_SEPARATION),
            zone("body", "text before", HTML_ZONE_SEPARATION),
        ];
        // Every remaining body fragment is split off by a separator element.
        let separated_body_texts =
            ["o", "n", "e", "two", "three", "four", "five", "si", "x", "text after"];
        expected.extend(
            separated_body_texts.iter().map(|&text| zone("body", text, SEPARATOR_EL_SEPARATION)),
        );
        expected.push(zone("heading", "test text", HTML_ZONE_SEPARATION));

        let (zones, _path) = loader_result.get_field_texts_and_path();
        assert_eq!(zones, expected);
    }
}
469}