capitol 0.5.1

Parse United States Congress legislative document citations
Documentation
use crate::Citation;
use crate::constants::{
    COMMITTEE_DOCUMENT_SEQUENCES, MEASURE_SEQUENCES, PUBL_SEQUENCES, STATUTE_SEQUENCES,
};
use crate::error::Error;
use crate::legislation::{Chamber, CommitteeDocumentType, Congress, MeasureType};
use crate::utils::Result;
use winnow::Parser;
use winnow::Result as WResult;
use winnow::ascii::{alpha1, alphanumeric0, digit0, digit1};

/// Raw tokens extracted from a citation string by `CitationParser::tokenize`,
/// before they are interpreted as a specific kind of `Citation`.
#[derive(Debug, Default, PartialEq)]
struct CiteParts {
    /// Number that precedes the object token, if any. `CitationParser::parse`
    /// uses it as the congress number, or as the volume for statute citations.
    prefix: Option<usize>,
    /// Lowercased alphabetic token identifying the document type (e.g. `"hr"`,
    /// `"s"`, `"publ"`).
    object: String,
    /// Number that follows the object token.
    number: usize,
    /// Trailing alphanumeric text after the number, if any.
    /// `CitationParser::parse` uses it as the version of a measure (e.g. `"ih"`).
    suffix: Option<String>,
}

/// Stateless namespace for the citation-parsing associated functions; it has no
/// fields and is never required to be instantiated by the functions in this file.
pub(crate) struct CitationParser {}

impl CitationParser {
    fn parse_prefix<'s>(input: &mut &'s str) -> WResult<&'s str> {
        digit0.parse_next(input)
    }

    fn parse_object<'s>(input: &mut &'s str) -> WResult<&'s str> {
        alpha1.parse_next(input)
    }

    fn parse_number(input: &mut &str) -> WResult<usize> {
        digit1.parse_to().parse_next(input)
    }

    fn parse_suffix<'s>(input: &mut &'s str) -> WResult<&'s str> {
        alphanumeric0.parse_next(input)
    }

    fn tokenize(input: &str) -> Result<CiteParts> {
        // initialize parts container
        let mut parts = CiteParts::default();
        let input = input.to_lowercase();

        if PUBL_SEQUENCES.iter().any(|s| input.starts_with(s)) {
            for part in input.split(' ') {
                if part.contains('-') {
                    let congress_and_number: Vec<&str> = part.split('-').collect();
                    return Ok(CiteParts {
                        prefix: Some(congress_and_number[0].parse::<usize>()?),
                        object: "publ".to_string(),
                        number: congress_and_number[1].parse::<usize>()?,
                        suffix: None,
                    });
                }
            }
        }

        let cleaned = input
            .to_lowercase()
            .replace(|c: char| !c.is_alphanumeric(), "");

        let mut input = cleaned.as_str();

        let (prefix, object, number, suffix) = (
            Self::parse_prefix,
            Self::parse_object,
            Self::parse_number,
            Self::parse_suffix,
        )
            .parse_next(&mut input)?;
        let prefix: Option<usize> = prefix.parse().ok();

        parts.prefix = prefix;
        parts.object = object.to_string();
        parts.number = number;
        parts.suffix = if suffix.is_empty() {
            None
        } else {
            Some(suffix.to_string())
        };

        Ok(parts)
    }

    pub(crate) fn parse(input: &str) -> Result<Citation> {
        let parts = Self::tokenize(input)?;

        let chamber = Chamber::parse(&parts.object);
        let document_type = parts.object.as_str();
        let congress = if let Some(num) = parts.prefix {
            Congress::parse(num, chamber.as_ref(), document_type).ok()
        } else {
            None
        };
        let number = parts.number;

        if MEASURE_SEQUENCES.contains(&document_type) {
            let version = parts.suffix;
            let measure_type =
                MeasureType::parse(document_type).ok_or(Error::UnknownLegislativeDocumentType)?;

            Ok(Citation::Measure {
                congress,
                chamber: chamber.unwrap(),
                number,
                version,
                measure_type,
            })
        } else if COMMITTEE_DOCUMENT_SEQUENCES.contains(&document_type) {
            let document_type = CommitteeDocumentType::parse(document_type)
                .ok_or(Error::UnknownLegislativeDocumentType)?;

            Ok(Citation::CommitteeDocument {
                congress,
                chamber: chamber.unwrap(),
                document_type,
                number,
            })
        } else if PUBL_SEQUENCES.contains(&document_type) {
            Ok(Citation::Law { congress, number })
        } else if STATUTE_SEQUENCES.contains(&document_type) {
            Ok(Citation::Statute {
                volume: parts.prefix.unwrap(),
                page: number,
            })
        } else {
            Err(Error::UnknownLegislativeDocumentType)
        }
    }
}

/// Unit tests for `CitationParser::tokenize`.
#[cfg(test)]
mod test {
    use super::*;

    /// House bill with a congress prefix and no version suffix.
    #[test]
    fn test_tokenize_no_ver_house_bill() {
        let input = "118hr8070";
        let expected = CiteParts {
            prefix: Some(118),
            object: String::from("hr"),
            number: 8070,
            suffix: None,
        };
        let result = CitationParser::tokenize(input).unwrap();
        assert_eq!(expected, result);
    }

    /// Senate bill with a congress prefix and no version suffix.
    #[test]
    fn test_tokenize_no_ver_senate_bill() {
        let input = "118s5";
        let expected = CiteParts {
            prefix: Some(118),
            object: String::from("s"),
            number: 5,
            suffix: None,
        };
        let result = CitationParser::tokenize(input).unwrap();
        assert_eq!(expected, result);
    }

    /// House bill whose trailing letters are captured as the suffix.
    #[test]
    fn test_tokenize_with_ver_house_bill() {
        let input = "118hr555ih";
        let expected = CiteParts {
            prefix: Some(118),
            object: String::from("hr"),
            number: 555,
            suffix: Some(String::from("ih")),
        };
        let result = CitationParser::tokenize(input).unwrap();
        assert_eq!(expected, result);
    }

    /// Senate bill whose trailing letters are captured as the suffix.
    #[test]
    fn test_tokenize_with_ver_senate_bill() {
        let input = "118s17is";
        let expected = CiteParts {
            prefix: Some(118),
            object: String::from("s"),
            number: 17,
            suffix: Some(String::from("is")),
        };
        let result = CitationParser::tokenize(input).unwrap();
        assert_eq!(expected, result);
    }

    /// A citation without leading digits yields no prefix.
    #[test]
    fn tokenize_no_congress() {
        let input = "hr8070";
        let expected = CiteParts {
            prefix: None,
            object: String::from("hr"),
            number: 8070,
            suffix: None,
        };
        let result = CitationParser::tokenize(input).unwrap();
        assert_eq!(expected, result);
    }

    /// Uppercase input is lowercased before tokenizing.
    #[test]
    fn tokenize_uppercase_no_congress() {
        let input = "HR8070";
        let expected = CiteParts {
            prefix: None,
            object: String::from("hr"),
            number: 8070,
            suffix: None,
        };
        let result = CitationParser::tokenize(input).unwrap();
        assert_eq!(expected, result);
    }

    /// Punctuation and whitespace are stripped before tokenizing.
    #[test]
    fn tokenize_strips_punctuation_and_whitespace() {
        let input = "H.R. 8070";
        let expected = CiteParts {
            prefix: None,
            object: String::from("hr"),
            number: 8070,
            suffix: None,
        };
        let result = CitationParser::tokenize(input).unwrap();
        assert_eq!(expected, result);
    }

    /// Input with no alphabetic object token is rejected, not panicked on.
    #[test]
    fn tokenize_rejects_input_without_object() {
        assert!(CitationParser::tokenize("118").is_err());
    }

    /// Public-law citations in the "Public Law No" download format, with
    /// either `:` or `.` after "No".
    #[test]
    fn tokenize_law_download_citation() {
        let expected = CiteParts {
            prefix: Some(119),
            object: String::from("publ"),
            number: 68,
            suffix: None,
        };

        let input = "Public Law No: 119-68";
        let result = CitationParser::tokenize(input).unwrap();
        assert_eq!(expected, result);

        let input = "Public Law No. 119-68";
        let result = CitationParser::tokenize(input).unwrap();
        assert_eq!(expected, result);
    }
}