// libzettels 0.4.1
//
// A library intended as a backend for applications which implement Niklas Luhmann's system of a 'Zettelkasten'.
// Documentation
//Copyright (c) 2020-2022 Stefan Thesing
//
//This file is part of libzettels.
//
//libzettels is free software: you can redistribute it and/or modify
//it under the terms of the GNU General Public License as published by
//the Free Software Foundation, either version 3 of the License, or
//(at your option) any later version.
//
//libzettels is distributed in the hope that it will be useful,
//but WITHOUT ANY WARRANTY; without even the implied warranty of
//MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
//GNU General Public License for more details.
//
//You should have received a copy of the GNU General Public License
//along with Zettels. If not, see http://www.gnu.org/licenses/.

//! Auxiliary module for building and updating the index. 
//! To be more specific, this is used to parse markdown links (the inline
//! variant, only) in zettel files to extract the target of these links.
//! This module is used when the corresponding field of the [Config](struct.Config.html)  
//! is set to `IndexingMethod::Native`.
//! Compared to the other variants of `IndexingMethod`, this is probably slow.
// --------------------------------------------------------------------------

// External imports
use std::io::{BufRead, BufReader};

use regex::Regex;

// Internal imports
use super::*;

// --------------------------------------------------------------------------
// Functions
// --------------------------------------------------------------------------

/// Extracts the targets of the inline markdown links contained in `line`.
///
/// `r` is a regular expression that matches markdown links, `line` is the
/// text to be searched. For every link found, the target (the text between
/// `](` and the closing `)`) is returned as a `String`. A target may itself
/// contain pairs of parentheses, e.g. `f(o)o.md`.
///
/// The returned targets are not necessarily in order of appearance: if one
/// regex match spans several links, the later links of that match come first.
///
/// This function does not panic on malformed input: a match without any `[`
/// yields no link, and if no closing parenthesis can be determined for a
/// target, everything up to the end of the respective segment is used.
fn parse_line(r: &Regex, line: &str) -> Vec<String> {
    let mut links = Vec::new();

    for m in r.find_iter(line) {
        // One match may span more than one link. Split it at "[" and drop
        // whatever preceded the first "[".
        let segments: Vec<&str> = m.as_str().split('[').skip(1).collect();

        // A match that contains no "[" at all cannot hold an inline link.
        let (first, later) = match segments.split_first() {
            Some(parts) => parts,
            None => continue,
        };

        // Every segment after the first is treated as a line of its own and
        // parsed recursively (last segment first). The "[" removed by the
        // split has to be restored for the regex to match again.
        for segment in later.iter().rev() {
            let temp_line = format!("[{}", segment);
            links.extend(parse_line(r, &temp_line));
        }

        // The first segment is handled right here. Without the sequence "]("
        // it is no inline link and there is nothing to extract.
        let rest = match first.splitn(2, "](").nth(1) {
            Some(rest) => rest,
            None => continue,
        };

        // `rest` starts with the target, but we can neither cut at the last
        // ")" (it might belong to unrelated text, as in `foo.md) (bar)`) nor
        // at the first one (it might close a pair of parentheses inside the
        // file name, as in `f(o)o.md)`). So we look for the first ")" that is
        // not preceded by a "(": split at ")" and add up the byte lengths of
        // the parts until we reach a part without "(".
        let mut end = 0;
        for part in rest.split(')') {
            end += part.len();
            if !part.contains('(') {
                break;
            }
            end += 1; // account for the ")" consumed by the split
        }
        // If even the last part contained a "(", `end` is one past the end
        // of `rest` (there is no ")" after the last part). Clamp it so the
        // slice below cannot go out of bounds.
        let end = end.min(rest.len());

        links.push(rest[..end].to_string());
    }
    links
}

/// Parses the given zettel files and collects the targets of the inline
/// markdown links they contain.
///
/// - `rootdir` is the root directory of the Zettelkasten and serves as the
///   point of reference for all paths.
/// - `pattern` is a regular expression that matches markdown links.
/// - `files` is the list of files to be parsed. Each entry is joined onto
///   `rootdir`, so the paths are expected to be relative to `rootdir`.
///
/// It returns a list of tuples of `PathBuf`s. The first `PathBuf` references
/// the file in question, the second the target of a link found in that file.
/// Both paths are relative to `rootdir`.
///
/// Targets starting with `http://` or `https://` are considered external
/// links and are not part of the result.
///
/// # Panics
/// Panics if `pattern` is not a valid regular expression.
///
/// # Errors
/// - [`Error::BadLink`](enum.Error.html#variant.BadLink) if one of the files 
///   links to a target that doesn't exist.
/// - [`Error::Io`](enum.Error.html#variant.Io) wrapping several kinds of 
///   `std::io::Error`, e.g. a file or `rootdir` doesn't exist, or a file
///   can not be opened or read.
/// - [`Error::NormalizePath`](enum.Error.html#variant.NormalizePath) if the 
///   path of one of the files can not be expressed relative to the root
///   directory.
pub fn parse_files<P: AsRef<Path>>(rootdir: P, pattern: &str, files: Vec<PathBuf>)
    -> Result<Vec<(PathBuf, PathBuf)>, Error> {
    let mut collected_links = vec![];
    let r = Regex::new(pattern)
            .expect("Failed to build regex.");
    
    let rootdir = rootdir.as_ref();
    
    for file in files {
        let file = rootdir.join(file).canonicalize()?;
        trace!("Working on links from {:?}", file);
        let reader = BufReader::new(std::fs::File::open(&file)?);
        for line in reader.lines() {
            let line = line?; // unwrap each result and propagate errors
            // now, let's look for matches to our regex pattern
            for target in parse_line(&r, &line) {
                // We don't want external links. Test for the complete URL
                // scheme: a bare "http" prefix would also drop local files
                // whose name merely starts with these letters.
                if target.starts_with("http://") || target.starts_with("https://") {
                    continue;
                }
                let target = normalize_link(rootdir, 
                                            file.as_path(), 
                                            Path::new(&target))?; //error::Error
                let source = normalize_path(rootdir, 
                                            file.as_path())?; //error::Error
                // Put the tuple into the container
                collected_links.push((source, target));
            }
        }
    }
    Ok(collected_links)
}


// --------------------------------------------------------------------------
#[cfg(test)]
mod tests {
    use super::*;
    use std::fs;
    extern crate tempfile;

    /// Checks `parse_line` on lines with one link, two links and no link.
    #[test]
    fn test_parse_line() {
        let regex = Regex::new(PATTERN)
            .expect("Failed to build regex.");

        // One link on the line.
        let found = parse_line(&regex, "Duis [ornare](enim) magna");
        assert_eq!(found[0], "enim");

        // Two links on the line; their order is deliberately not checked.
        let found = parse_line(&regex, "Duis [ornare](enim) [magna](foo)");
        assert_eq!(found.len(), 2);
        for expected in &["enim", "foo"] {
            assert!(found.contains(&String::from(*expected)));
        }

        // No link at all.
        let found = parse_line(&regex, "Integer consectetur neque velit, at.");
        assert!(found.is_empty());
    }

    /// Checks `parse_files` on a file in a subdirectory that links to a file
    /// in the root directory (twice) and to a sibling file.
    #[test]
    fn test_parse_files() {
        use std::io::Write;
        let tmp_dir = tempfile::tempdir()
            .expect("Failed to create tempdir.");
        let root = tmp_dir.path();
        fs::create_dir_all(root.join("subdir"))
            .expect("Failed to create directory structure.");

        // The zettel to be parsed.
        let source_path = root.join("subdir/file.md");
        let mut source = fs::File::create(&source_path)
            .expect("Failed to create file.");
        // The (empty) files the zettel links to.
        fs::File::create(root.join("file2.md"))
            .expect("Failed to create file2.");
        fs::File::create(root.join("subdir/file3.md"))
            .expect("Failed to create file3.");

        write!(source, "Duis [ornare](../file2.md) magna
consectetur neque velit, at.
Duis [ornare](../file2.md) [magna](file3.md)")
        .expect("Failed to write to file");

        let result = parse_files(root, PATTERN, vec![source_path.clone()]);
        assert!(result.is_ok());
        let links = result.unwrap();
        assert_eq!(links.len(), 3);

        // Both paths of every tuple are expected relative to the root.
        let origin = PathBuf::from("subdir/file.md");
        for target in &["file2.md", "subdir/file3.md"] {
            let expected = (origin.clone(), PathBuf::from(*target));
            assert!(links.contains(&expected));
        }
    }
}