netscape_to_universal/
lib.rs

1use crate::node_ref_ext::*;
2use kuchiki::{parse_html, traits::TendrilSink, NodeRef};
3use std::{
4    fs::File,
5    io::{self, Error, ErrorKind, Read, Write},
6    path::PathBuf,
7};
8
9mod node_ref_ext;
10
/// Opens the input source named by `input_path` for reading.
///
/// A path of exactly `-` selects standard input; any other path is opened as a file.
///
/// # Errors
///
/// Returns an [`Error`] when the file cannot be opened. The message is prefixed
/// with the offending path, and the kind of the underlying error (for example
/// [`ErrorKind::NotFound`]) is preserved so callers can tell failures apart.
pub fn read_path(input_path: PathBuf) -> Result<Box<dyn Read>, Error> {
    if input_path.as_os_str() == "-" {
        // Get input from stdin
        return Ok(Box::new(io::stdin()) as Box<dyn Read>);
    }

    // Get input from file
    File::open(&input_path)
        .map(|file| Box::new(file) as Box<dyn Read>)
        .map_err(|err| {
            // Keep the original kind; only enrich the message with the path.
            Error::new(err.kind(), format!("{}: {}", input_path.display(), err))
        })
}
26
27pub fn convert<R: Read, W: Write>(input: &mut R, output: &mut W) -> Result<(), Error> {
28    let node = parse_html().from_utf8().read_from(input).unwrap();
29    to_universal(node, output)
30}
31
// 1. Traverse the tree depth-first.
// 2. On each tree level, store the parent of the child nodes
// that are to be scanned next.
// 3. For each child node that is a leaf,
// set its list of ancestors, using the ancestor list of the parent
// and adding the parent to the list.
// Output the node's href and the node's ancestor list as a list of tags (URI #tag1 #tag2...)
// 4. For each child that is not a leaf,
// continue the scan (step 2).
41fn to_universal(node: NodeRef, output: &mut dyn Write) -> Result<(), Error> {
42    if let Some(root) = node.children().find(|n| n.is_element("HTML")) {
43        if let Some(body) = root.children().find(|child| child.is_element("BODY")) {
44            if let Some(content) = body.children().find(|child| child.is_element("DL")) {
45                for item in content.children() {
46                    to_universal_rec(&item, vec![], output)?;
47                }
48            } else {
49                return Err(Error::new(
50                    ErrorKind::InvalidData,
51                    "Invalid file format: missing content element DL.",
52                ));
53            }
54        }
55    }
56    Ok(())
57}
58
59fn to_universal_rec(
60    node: &NodeRef,
61    mut ancestors: Vec<String>,
62    output: &mut dyn Write,
63) -> Result<(), Error> {
64    if node.is_element("DT") {
65        // See if the node is a bookmark
66        if let Some(node_a) = node.children().find(|n| n.is_element("A")) {
67            if let Some(attribute) = node_a.select_attribute("HREF") {
68                writeln!(
69                    output,
70                    "{}{}{}",
71                    attribute.value,
72                    if ancestors.is_empty() { "" } else { " " },
73                    ancestors.join(" ")
74                )?;
75                return Ok(());
76            }
77        }
78
79        // At this point we know that the node is not a bookmark
80        // and so we verify whether the node is a folder
81        if let Some(node_h3) = node.children().find(|n| n.is_element("H3")) {
82            let title = node_h3.text_contents();
83            // Add title to list of ancestors for the children nodes to come
84            ancestors.push(format!("#{}", title.replace(' ', "-")));
85            // Look for children
86            for sibling in node_h3.following_siblings() {
87                if sibling.is_element("DL") {
88                    for child in sibling.children() {
89                        to_universal_rec(&child, ancestors.clone(), output)?;
90                    }
91                }
92            }
93        }
94    }
95    Ok(())
96}