repodb_parser 0.3.1

Parser for Arch Linux repository DBs
Documentation
// SPDX-FileCopyrightText: 2022-2024 Michael Picht <mipi@fsfe.org>
//
// SPDX-License-Identifier: GPL-3.0-or-later

use crate::{
    dep::{Dep, OptDep},
    pkg::Pkg,
};
use anyhow::{anyhow, Context};
use chrono::DateTime;
use std::{
    collections::BTreeMap,
    io::BufRead,
    str::{self, FromStr},
};
use url::Url;

// Names of key fields of package meta data in repository DB. Every key is
// wrapped in '%' characters; `parse_pkg` maps each key to one field of `Pkg`.

/// Key of the package name (`Pkg::name`), exactly one value
const KEY_NAME: &str = "%NAME%";
/// Key of `Pkg::file_name`, exactly one value
const KEY_FILE_NAME: &str = "%FILENAME%";
/// Key of `Pkg::base`, exactly one value
const KEY_BASE: &str = "%BASE%";
/// Key of `Pkg::version`, exactly one value
const KEY_VERSION: &str = "%VERSION%";
/// Key of `Pkg::desc`, exactly one value
const KEY_DESC: &str = "%DESC%";
/// Key of `Pkg::groups`, any number of values
const KEY_GROUPS: &str = "%GROUPS%";
/// Key of `Pkg::c_size`, exactly one value that is parsed as a number
const KEY_C_SIZE: &str = "%CSIZE%";
/// Key of `Pkg::i_size`, exactly one value that is parsed as a number
const KEY_I_SIZE: &str = "%ISIZE%";
/// Key of `Pkg::md5_sum`, exactly one value that is decoded from a hex string
const KEY_MD5_SUM: &str = "%MD5SUM%";
/// Key of `Pkg::sha256_sum`, exactly one value that is decoded from a hex string
const KEY_SHA256_SUM: &str = "%SHA256SUM%";
/// Key of `Pkg::pgp_sig`, at most one value
const KEY_PGP_SIG: &str = "%PGPSIG%";
/// Key of `Pkg::url`, at most one value that is parsed as a URL
const KEY_URL: &str = "%URL%";
/// Key of `Pkg::license`, any number of values
const KEY_LICENSE: &str = "%LICENSE%";
/// Key of `Pkg::arch`, exactly one value
const KEY_ARCH: &str = "%ARCH%";
/// Key of `Pkg::build_date`, exactly one value that is parsed as a Unix timestamp
const KEY_BUILD_DATE: &str = "%BUILDDATE%";
/// Key of `Pkg::packager`, exactly one value
const KEY_PACKAGER: &str = "%PACKAGER%";
/// Key of `Pkg::replaces`, any number of values
const KEY_REPLACES: &str = "%REPLACES%";
/// Key of `Pkg::conflicts`, any number of values
const KEY_CONFLICTS: &str = "%CONFLICTS%";
/// Key of `Pkg::provides`, any number of values
const KEY_PROVIDES: &str = "%PROVIDES%";
/// Key of `Pkg::deps`, any number of values that are parsed as `Dep`
const KEY_DEPS: &str = "%DEPENDS%";
/// Key of `Pkg::opt_deps`, any number of values that are parsed as `OptDep`
const KEY_OPT_DEPS: &str = "%OPTDEPENDS%";
/// Key of `Pkg::check_deps`, any number of values that are parsed as `Dep`
const KEY_CHECK_DEPS: &str = "%CHECKDEPENDS%";
/// Key of `Pkg::make_deps`, any number of values that are parsed as `Dep`
const KEY_MAKE_DEPS: &str = "%MAKEDEPENDS%";

// Helper macros for parsing of package meta data from repository DB

/// Checks that there is exactly one value for a key.
///
/// Evaluates to `Ok(())` if `$values.len()` is 1. Otherwise it evaluates to an
/// `Err` whose message names the package and the key and tells whether the
/// value is missing or whether there are too many values.
macro_rules! check_exactly_one {
    ($values:expr , $pkg:expr , $key:expr) => {{
        let count = $values.len();
        if count == 1 {
            Ok(())
        } else if count == 0 {
            Err(anyhow!(
                "package {} has no value for key '{}'",
                $pkg, $key
            ))
        } else {
            Err(anyhow!(
                "package {} has more than one value assigned to key '{}'",
                $pkg, $key
            ))
        }
    }};
}

/// Checks that there is no or exactly one value for a key.
///
/// Evaluates to `Ok(())` if `$values.len()` is 0 or 1. Otherwise it evaluates
/// to an `Err` whose message names the package and the key.
macro_rules! check_zero_or_one {
    ($values:expr , $pkg:expr , $key:expr) => {
        match $values.len() {
            0 | 1 => Ok(()),
            _ => Err(anyhow!(
                "package {} has more than one value assigned to key '{}'",
                $pkg, $key
            )),
        }
    };
}

/// Parses the first value of a key as Unix timestamp (seconds).
///
/// Evaluates to a `Result` that contains what `DateTime::from_timestamp`
/// returns for the parsed number of seconds. The result is an `Err` if
///
/// * the value is not a signed 64 bit integer, or
/// * `DateTime::from_timestamp` rejects the number as out of range.
///
/// The value is parsed directly as `i64`, i.e., as the type that
/// `DateTime::from_timestamp` expects. Parsing it as `usize` and casting it
/// afterwards would wrap numbers above `i64::MAX` into negative timestamps,
/// would make the accepted range depend on the pointer width of the platform
/// and would reject timestamps before 1970.
///
/// `$src` must contain at least one value (it is indexed with `[0]`).
macro_rules! parse_datetime {
    ($src:expr , $pkg_name:expr , $field_name:expr) => {
        $src[0]
            .parse::<i64>()
            .with_context(|| {
                format!(
                    "{} of package {} is not a valid Unix timestamp",
                    &$field_name, $pkg_name
                )
            })
            .and_then(|timestamp| {
                DateTime::from_timestamp(timestamp, 0).ok_or_else(|| {
                    anyhow!(
                        "{} of package {} is out of range",
                        &$field_name,
                        $pkg_name
                    )
                })
            })
    };
}

/// Converts all values of a key into `Dep` instances and appends them to
/// `$target`.
///
/// The macro contains a `?`: it must be expanded inside a function or closure
/// that returns a `Result`. The first value that `Dep::from_str` rejects stops
/// the conversion; the error is returned with a context that names the key and
/// the package.
macro_rules! parse_dependencies {
    ($values:expr , $target:expr , $pkg:expr , $key:expr) => {
        for raw in $values {
            let dep = Dep::from_str(&raw).with_context(|| {
                format!(
                    "{} of package {} contains invalid dependencies",
                    &$key, $pkg
                )
            })?;
            $target.push(dep);
        }
    };
}

/// Converts all values of a key into `OptDep` instances and appends them to
/// `$target`.
///
/// The macro contains a `?`: it must be expanded inside a function or closure
/// that returns a `Result`. The first value that `OptDep::from_str` rejects
/// stops the conversion; the error is returned with a context that names the
/// key and the package.
macro_rules! parse_opt_dependencies {
    ($values:expr , $target:expr , $pkg:expr , $key:expr) => {
        for raw in $values {
            let opt_dep = OptDep::from_str(&raw).with_context(|| {
                format!(
                    "{} of package {} contains invalid dependencies",
                    &$key, $pkg
                )
            })?;
            $target.push(opt_dep);
        }
    };
}

/// Decodes the first value of a key with `hex::decode`.
///
/// Evaluates to the `Result` of the decoding. A decoding error gets a context
/// that names the key and the package. `$values` must contain at least one
/// value (it is indexed with `[0]`).
macro_rules! parse_hex_string {
    ($values:expr , $pkg:expr , $key:expr) => {{
        let decoded = hex::decode(&$values[0]);
        decoded.with_context(|| {
            format!(
                "{} of package {} is not a valid hex string",
                &$key, $pkg
            )
        })
    }};
}

/// Extracts the package name from the (optional) values of the name key.
///
/// `$src` is an `Option` of a reference to the list of values. The macro
/// evaluates to
///
/// * `Ok` with a clone of the value if there is exactly one value,
/// * `Err` ("package has more than one name") if there are several values,
/// * `Err` ("package has no name") if the list is empty or `$src` is `None`.
macro_rules! parse_name {
    ($src:expr) => {
        match $src {
            Some(vals) if vals.len() == 1 => Ok(vals[0].clone()),
            Some(vals) if vals.len() > 1 => Err(anyhow!("package has more than one name")),
            // Either there is no entry for the name key or the entry is empty
            _ => Err(anyhow!("package has no name")),
        }
    };
}

/// Parses the first value of a key as `usize`.
///
/// Evaluates to the `Result` of the parsing. A parsing error gets a context
/// that names the key and the package. `$values` must contain at least one
/// value (it is indexed with `[0]`).
macro_rules! parse_number {
    ($values:expr , $pkg:expr , $key:expr) => {{
        let parsed = $values[0].parse::<usize>();
        parsed.with_context(|| format!("{} of package {} is not a number", &$key, $pkg))
    }};
}

/// Takes the last value out of the list of values of a key.
///
/// # Panics
///
/// Panics if `$src` is empty. Callers are expected to verify beforehand (e.g.,
/// with `check_exactly_one!`) that there is a value.
macro_rules! parse_string {
    ($src:expr) => {
        $src.pop()
            .expect("parse_string! requires a non-empty list of values")
    };
}

/// Takes the last value out of the list of values of a key, if there is one.
///
/// Evaluates to `Some(value)` if `$src` is not empty and to `None` otherwise.
/// This is exactly what `Vec::pop` does, so no emptiness check and no `unwrap`
/// is needed.
macro_rules! parse_string_option {
    ($src:expr) => {
        $src.pop()
    };
}

/// Replaces the content of `$target` by the values of a key.
///
/// If `$values` is empty, `$target` is left untouched. Otherwise `$values` is
/// consumed and `$target` contains exactly its elements afterwards.
macro_rules! parse_string_array {
    ($values:expr , $target:expr) => {
        if !$values.is_empty() {
            $target = $values.into_iter().collect();
        }
    };
}

/// Parses the first value of a key as URL and stores it in `$target`.
///
/// If `$values` is empty, `$target` is left untouched. Otherwise `$target` is
/// set to `Some` of the parsed URL.
///
/// The macro contains a `?`: it must be expanded inside a function or closure
/// that returns a `Result`. If `Url::parse` rejects the value, the error is
/// returned with a context that names the key and the package.
macro_rules! parse_url {
    ($values:expr , $target:expr , $pkg:expr , $key:expr) => {
        if let Some(raw) = $values.first() {
            let url = Url::parse(raw).with_context(|| {
                format!(
                    "{} of package {} is not a valid URL",
                    &$key, $pkg
                )
            })?;
            $target = Some(url);
        }
    };
}

/// Retrieve meta data of a package from reader and return it as Pkg structure
///
/// The input is split by `lexer` into section keys and their values. Each
/// known key fills one field of the returned `Pkg`. Fields whose key does not
/// occur in the input keep the value from `Pkg::default()`. All list fields
/// are sorted before the package is returned.
///
/// # Errors
///
/// An error is returned if
///
/// * `lexer` fails,
/// * there is not exactly one package name,
/// * a key that occurs in the input has the wrong number of values,
/// * a value cannot be converted (number, hex string, URL, Unix timestamp,
///   dependency), or
/// * the input contains an unknown key.
pub fn parse_pkg<R: BufRead>(reader: R) -> anyhow::Result<Pkg> {
    // Get package content from lexer
    let cnt = lexer(reader)?;

    // Initialize package data structure. Retrieve package name already here
    // since the name is needed in the for loop below for error messages
    let mut pkg = Pkg {
        name: parse_name!(cnt.get(KEY_NAME))?,
        ..Default::default()
    };

    // Fill package data from lexer result
    for (key, mut vals) in cnt {
        match key.as_str() {
            // Nothing to do since the package name was already retrieved
            KEY_NAME => {}
            KEY_FILE_NAME => {
                check_exactly_one!(vals, pkg.name, &key)?;
                pkg.file_name = parse_string!(vals);
            }
            KEY_BASE => {
                check_exactly_one!(vals, pkg.name, &key)?;
                pkg.base = parse_string!(vals);
            }
            KEY_VERSION => {
                check_exactly_one!(vals, pkg.name, &key)?;
                pkg.version = parse_string!(vals);
            }
            KEY_DESC => {
                check_exactly_one!(vals, pkg.name, &key)?;
                pkg.desc = parse_string!(vals);
            }
            KEY_GROUPS => parse_string_array!(vals, pkg.groups),
            KEY_C_SIZE => {
                check_exactly_one!(vals, pkg.name, &key)?;
                pkg.c_size = parse_number!(vals, pkg.name, key)?;
            }
            KEY_I_SIZE => {
                check_exactly_one!(vals, pkg.name, &key)?;
                pkg.i_size = parse_number!(vals, pkg.name, key)?;
            }
            KEY_MD5_SUM => {
                check_exactly_one!(vals, pkg.name, &key)?;
                pkg.md5_sum = parse_hex_string!(vals, pkg.name, key)?;
            }
            KEY_SHA256_SUM => {
                check_exactly_one!(vals, pkg.name, &key)?;
                pkg.sha256_sum = parse_hex_string!(vals, pkg.name, key)?;
            }
            KEY_PGP_SIG => {
                check_zero_or_one!(vals, pkg.name, &key)?;
                pkg.pgp_sig = parse_string_option!(vals);
            }
            KEY_URL => {
                check_zero_or_one!(vals, pkg.name, &key)?;
                parse_url!(vals, pkg.url, pkg.name, key);
            }
            KEY_LICENSE => parse_string_array!(vals, pkg.license),
            KEY_ARCH => {
                check_exactly_one!(vals, pkg.name, &key)?;
                pkg.arch = parse_string!(vals);
            }
            KEY_BUILD_DATE => {
                check_exactly_one!(vals, pkg.name, &key)?;
                pkg.build_date = parse_datetime!(vals, pkg.name, key)?;
            }
            KEY_PACKAGER => {
                check_exactly_one!(vals, pkg.name, &key)?;
                pkg.packager = parse_string!(vals);
            }
            KEY_REPLACES => parse_string_array!(vals, pkg.replaces),
            KEY_CONFLICTS => parse_string_array!(vals, pkg.conflicts),
            KEY_PROVIDES => parse_string_array!(vals, pkg.provides),
            KEY_DEPS => parse_dependencies!(vals, pkg.deps, pkg.name, key),
            KEY_OPT_DEPS => parse_opt_dependencies!(vals, pkg.opt_deps, pkg.name, key),
            KEY_CHECK_DEPS => parse_dependencies!(vals, pkg.check_deps, pkg.name, key),
            KEY_MAKE_DEPS => parse_dependencies!(vals, pkg.make_deps, pkg.name, key),
            _ => {
                return Err(anyhow!(
                    "package {} contains unknown key {}",
                    pkg.name,
                    key
                ));
            }
        }
    }

    // Sort all list fields, each of them exactly once
    pkg.groups.sort();
    pkg.license.sort();
    pkg.replaces.sort();
    pkg.conflicts.sort();
    pkg.provides.sort();
    pkg.deps.sort();
    pkg.opt_deps.sort();
    pkg.check_deps.sort();
    pkg.make_deps.sort();

    Ok(pkg)
}

/// Result data structure of lexer: maps each section key (e.g., `%NAME%`) to
/// the values that were read for it, in the order in which they were read
type PkgContent = BTreeMap<String, Vec<String>>;

/// Tokens that a line of package meta data can be turned into
enum Token {
    /// Line that starts and ends with `%`, e.g., `%NAME%`
    SectionKey(String),
    /// Any other non-empty line
    SectionValue(String),
}

impl FromStr for Token {
    type Err = anyhow::Error;

    /// Creates a token from a line.
    ///
    /// A line that both starts and ends with `%` (this includes a line that
    /// consists of a single `%`) becomes a section key, every other non-empty
    /// line becomes a section value. The line is stored unchanged in the
    /// token. An empty line results in an error.
    fn from_str(line: &str) -> anyhow::Result<Self, Self::Err> {
        if line.is_empty() {
            return Err(anyhow!("cannot create token from empty string"));
        }

        let text = line.to_owned();
        let is_key = line.starts_with('%') && line.ends_with('%');
        Ok(if is_key {
            Token::SectionKey(text)
        } else {
            Token::SectionValue(text)
        })
    }
}

/// Splits package meta data from reader into tokens and returns it as binary
/// tree map of string arrays
///
/// Each line is trimmed; lines that are empty after trimming are skipped. A
/// section key line makes that key the current key, every section value line
/// is appended to the values of the current key. A key only gets an entry in
/// the result once a value was read for it. Values that appear before the
/// first key are collected under the empty key.
///
/// # Errors
///
/// An error is returned if a line cannot be read from `reader` (the read error
/// is propagated instead of causing a panic) or if a token cannot be created
/// from a line.
fn lexer<R: BufRead>(reader: R) -> anyhow::Result<PkgContent> {
    let mut cnt = PkgContent::new();
    let mut sec_key = String::new();

    for line in reader.lines() {
        // Reading can fail (I/O error, invalid UTF-8): hand the error to the
        // caller
        let line = line?;
        let line = line.trim();
        if line.is_empty() {
            continue;
        }

        match Token::from_str(line)? {
            Token::SectionKey(key) => sec_key = key,
            Token::SectionValue(val) => cnt.entry(sec_key.clone()).or_default().push(val),
        }
    }

    Ok(cnt)
}