// libzettels 0.4.1
//
// A library intended as a backend for applications which implement Niklas Luhmann's system of a 'Zettelkasten'.
// Documentation
//Copyright (c) 2020-2022 Stefan Thesing
//
//This file is part of libzettels.
//
//libzettels is free software: you can redistribute it and/or modify
//it under the terms of the GNU General Public License as published by
//the Free Software Foundation, either version 3 of the License, or
//(at your option) any later version.
//
//libzettels is distributed in the hope that it will be useful,
//but WITHOUT ANY WARRANTY; without even the implied warranty of
//MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
//GNU General Public License for more details.
//
//You should have received a copy of the GNU General Public License
//along with Zettels. If not, see http://www.gnu.org/licenses/.

//! Auxiliary module for building and updating the index using grep. 
//! To be more specific, this is used to parse markdown links (the inline
//! variant, only) in zettel files to extract the target of these links.
//! This module is used when the corresponding field of the [Config](struct.Config.html)  
//! is set to `IndexingMethod::Grep`.
// --------------------------------------------------------------------------

// External imports
use std::process::Command;

// Internal imports
use super::*;

// --------------------------------------------------------------------------
// Functions
// --------------------------------------------------------------------------

/// In order to find the targets a zettel links to, the markdown links of
/// said zettel need to be parsed. That's what this function is for.
/// It takes a grep-compatible regex pattern as a `&str`, and a list of files
/// to be passed as arguments to grep.
/// Note: The list of files contains paths relative to rootdir!
/// Furthermore, it takes a reference to the rootdir as a point of reference.
/// It returns a list of tuples of `PathBuf`s. The first PathBuf references
/// the file in question, the second the target of a link found in that file.
/// Both paths are relative to `rootdir`.
///
/// Throws an error if a file or rootdir doesn't exist and propagates any
/// errors that `grep` might return.
/// # Errors
/// - [`Error::BadLink`](enum.Error.html#variant.BadLink) if one of the files 
///   links to a target that doesn't exist.
/// - [`Error::Io`](enum.Error.html#variant.Io) wrapping several kinds of 
///   `std::io:Error`, e.g. problems executing grep, problems with the files etc.
/// - [`Error::NormalizePath`](enum.Error.html#variant.NormalizePath) if the 
///   path of one of the files contained in output can not be expressed 
///   relative to the root directory.
pub fn parse_files<P: AsRef<Path>>(rootdir: P, pattern: &str, mut files: Vec<PathBuf>)
    -> Result<Vec<(PathBuf, PathBuf)>, Error> {
    let rootdir = rootdir.as_ref();
    let mut os_files = vec![];
    for file in files.drain(..) {
        let can_file = rootdir.join(file).canonicalize()?; //std::io::error
        let can_file = can_file.into_os_string();
        trace!("Working on links from {:?}", can_file);
        os_files.push(can_file);
    }
    
    // If we have no files to parse, we can cut this short. Empty result.
    // For Ripgrep, this fixes Issue #1, for Grep, it just speeds things up.
    if os_files.is_empty() {
        return Ok(vec![]);
    }    
    
    // make rootdir absolute
    let rootdir = &rootdir.canonicalize()?;                //std::io::error
    let output = Command::new("grep")      // run grep
                            .arg("-E")
                            .arg("-o")
                            .arg("--with-filename")
                            .arg(pattern)
                            .args(os_files)
                            .output()?;                   //std::io::error  
    let output = String::from_utf8_lossy(&output.stdout);
    
    process_output(&rootdir, output)     //error::Error
}

/// Called by `parse_files`. Converts the output of grep 
/// from something like this:
/// ```shell, no_run
/// /home/user/Zettelkasten/file1.md:[some text](file2.md)
/// /home/user/Zettelkasten/subdir/file5.md:[more text](../file1.md)
/// ```
/// to something like this:
/// ```rust, no_run
/// # use std::path::Path;
/// vec![
///   //source                      //target  
/// ( Path::new("file1.md"),        Path::new("file2.md") ),
/// ( Path::new("subdir/file5.md"), Path::new("file1.md") ),
/// ];
/// ```
/// Note that the target of the second processed link is no longer relative 
/// to its source (`../file1.md`) but relative to the root directory 
/// (`file1.md`).
///
/// Lines that do not have the shape `<file>:<text>](<target>` (no `:` or no
/// `](`) carry no link and are skipped. Targets starting with `http://` or
/// `https://` are external links and are skipped as well.
/// # Errors
/// - [`Error::BadLink`](enum.Error.html#variant.BadLink) if one of the files 
///   links to a target that doesn't exist.
/// - [`Error::Io`](enum.Error.html#variant.Io) wrapping several kinds of 
///   `std::io::Error`.
/// - [`Error::NormalizePath`](enum.Error.html#variant.NormalizePath) if the 
///   path of one of the files contained in output can not be expressed 
///   relative to the root directory.
fn process_output<P: AsRef<Path>, T: AsRef<str>>(rootdir: P, output: T) 
    -> Result<Vec<(PathBuf, PathBuf)>, Error> {
    let rootdir = rootdir.as_ref();
    // Prepare the container for the return values
    let mut collected_links: Vec<(PathBuf, PathBuf)> = vec![];

    for line in output.as_ref().lines() {
        // Anything that is not a `<file>:[text](target)` line is ignored
        // instead of bringing the whole run down with a panic.
        let (source, target) = match split_grep_line(line) {
            Some(parts) => parts,
            None => continue,
        };

        // Now we have our target. But we don't want external links.
        if is_external_link(target) {
            continue;
        }

        // The target is resolved first, so a broken link is reported
        // before any problem with the source path.
        let target = normalize_link(rootdir,
                                    Path::new(source),
                                    Path::new(target))?; //error::Error
        let source = normalize_path(rootdir,
                                    Path::new(source))?; //error::Error
        // Put the tuple into the container
        collected_links.push((source, target));
    }
    Ok(collected_links)
}

/// Splits one line of grep output into the name of the file the match was
/// found in and the raw (not yet normalized) target of the link.
/// Returns `None` if the line contains no `:` or no `](`.
fn split_grep_line(line: &str) -> Option<(&str, &str)> {
    // split off the filename
    let mut parts = line.splitn(2, ':');
    let source = parts.next()?;
    let matched = parts.next()?;
    // discard anything before the target of the link
    let rest = matched.splitn(2, "](").nth(1)?;
    Some((source, &rest[..link_target_len(rest)]))
}

/// Returns the length in bytes of the link target at the start of `rest`,
/// where `rest` is everything following the `](` of a markdown link.
///
/// We can neither cut at the last `)`, because it might be unrelated to the
/// link (`foo.md) (bar)`), nor at the first one, because it might belong to
/// a pair of parentheses in the filename (`f(o)o.md)`). So we look for the
/// first `)` that is not preceded by a `(`: we split at `)` and walk over
/// the parts until we find one without a `(`.
fn link_target_len(rest: &str) -> usize {
    let mut end = 0;
    for part in rest.split(')') {
        end += part.len();
        if part.contains('(') {
            end += 1; // for the split ")", since we'll do another round
        } else {
            break;
        }
    }
    // If even the last part contained a "(", we counted a ")" that isn't
    // there. Never point past the end of the string.
    end.min(rest.len())
}

/// Tells whether `target` is a web link rather than a file. Only the
/// actual URL schemes are checked, so that a local file whose name merely
/// begins with "http" (e.g. `httpd.md`) is still treated as a file.
fn is_external_link(target: &str) -> bool {
    target.starts_with("http://") || target.starts_with("https://")
}

// --------------------------------------------------------------------------
#[cfg(test)]
mod tests {
    extern crate tempfile;
    use self::tempfile::tempdir;
    use super::*;
    use examples::*;

    // ----------------------------------------------------------------------
    // valid data
    // ----------------------------------------------------------------------

    // Hand-written grep output for the example Zettelkasten: two links to
    // file1.md, one external link (to be dropped) and one link to file3.md.
    #[test]
    fn test_process_grep_output() {
        let scratch = tempdir().expect("Failed to setup temp dir");
        generate_bare_examples(scratch.path()).expect("Failed to generate examples");
        let rootdir = scratch.path().join("examples/Zettelkasten/");
        let abs_root = rootdir.canonicalize()
            .expect("Failed to resolve the example directory for this test.");

        // The first line is started from the path itself, the others are
        // appended as (file, match) pairs.
        let mut grep_output = abs_root.join("subdir/file4.md")
            .into_os_string()
            .into_string()
            .expect("Something went wrong.");
        grep_output.push_str(":[File 1 in the root directory](../file1.md)\n");
        let further_lines = [
            ("subdir/file5.md", ":[block](../file1.md)\n"),
            ("file2.md", ":[one more hyperlink](http://example.com)\n"),
            ("file2.md", ":[hair](file3.md)\n"),
        ];
        for &(file, matched) in further_lines.iter() {
            grep_output.push_str(abs_root.join(file).as_os_str().to_str().unwrap());
            grep_output.push_str(matched);
        }

        let result = process_output(&rootdir, grep_output);
        trace!("{:?}", result);
        assert!(result.is_ok());
        let links = result.unwrap();
        assert_eq!(links.len(), 3);

        let expected = [
            ("subdir/file4.md", "file1.md"),
            ("subdir/file5.md", "file1.md"),
            ("file2.md", "file3.md"),
        ];
        for &(source, target) in expected.iter() {
            assert!(links.contains(
                &(PathBuf::from(source), PathBuf::from(target))
                ));
        }
    }

    // Same expectations as above, but this time grep is actually run over
    // the example files.
    #[test]
    fn test_grep_files() {
        let scratch = tempdir().expect("Failed to setup temp dir");
        generate_bare_examples(scratch.path()).expect("Failed to generate examples");
        let rootdir = scratch.path().join("examples/Zettelkasten/");

        let files: Vec<PathBuf> = ["subdir/file4.md", "subdir/file5.md", "file2.md"]
            .iter()
            .map(|name| PathBuf::from(*name))
            .collect();

        let result = parse_files(&rootdir, PATTERN, files);
        assert!(result.is_ok());
        let links = result.unwrap();
        assert_eq!(links.len(), 3);

        let expected = [
            ("subdir/file4.md", "file1.md"),
            ("subdir/file5.md", "file1.md"),
            ("file2.md", "file3.md"),
        ];
        for &(source, target) in expected.iter() {
            assert!(links.contains(
                &(PathBuf::from(source), PathBuf::from(target))
                ));
        }
    }

    // ----------------------------------------------------------------------
    // invalid data aka error handling
    // ----------------------------------------------------------------------

    // The last line links to a file that is not part of the examples, which
    // has to surface as a BadLink error naming the source and the link.
    #[test]
    fn test_process_grep_output_bad_link() {
        let scratch = tempdir().expect("Failed to setup temp dir");
        generate_bare_examples(scratch.path()).expect("Failed to generate examples");
        let rootdir = scratch.path().join("examples/Zettelkasten/");
        let abs_root = rootdir.canonicalize()
            .expect("Failed to resolve the example directory for this test.");

        let mut grep_output = abs_root.join("subdir/file4.md")
            .into_os_string()
            .into_string()
            .expect("Something went wrong.");
        grep_output.push_str(":[File 1 in the root directory](../file1.md)\n");
        let further_lines = [
            ("subdir/file5.md", ":[block](../file1.md)\n"),
            ("file2.md", ":[one more hyperlink](http://example.com)\n"),
            ("file2.md", ":[hair](foo.md)\n"), // doesn't exist
        ];
        for &(file, matched) in further_lines.iter() {
            grep_output.push_str(abs_root.join(file).as_os_str().to_str().unwrap());
            grep_output.push_str(matched);
        }

        let result = process_output(&rootdir, grep_output);
        trace!("{:?}", result);
        assert!(result.is_err());
        let failure = result.unwrap_err();
        match failure {
            Error::BadLink(source, wrong_link, inner) => {
                assert_eq!(source, abs_root.join("file2.md"));
                assert!(wrong_link.ends_with("foo.md"));
                assert_eq!(inner.kind(), std::io::ErrorKind::NotFound);
                assert!(inner.to_string().contains("No such file or directory"));
            },
            _ => panic!("Expected BadLink error, found {:#?}", failure),
        }
    }
}