biblib 0.4.3 - Docs.rs

//! RIS format parsing implementation.
//!
//! This module handles the low-level parsing of RIS formatted text.

use crate::ris::structure::RawRisData;
use crate::ris::tags::RisTag;
use crate::utils::parse_author_name;
use crate::{
    Author, CitationFormat,
    error::{ParseError, SourceSpan, ValueError},
};

/// Parse the content of a RIS formatted file, returning structured data.
pub(crate) fn ris_parse<S: AsRef<str>>(ris_text: S) -> Result<Vec<RawRisData>, ParseError> {
    let text = ris_text.as_ref();

    if text.trim().is_empty() {
        return Ok(Vec::new());
    }

    let mut citations = Vec::new();
    let mut current_citation = RawRisData::new();
    let mut line_number = 0;
    let text_ptr = text.as_ptr() as usize;
    // Track the last successfully parsed non-author tag for continuation line appending.
    let mut last_tag: Option<RisTag> = None;

    for raw_line in text.lines() {
        line_number += 1;
        // Compute byte start of this line via pointer arithmetic.
        let line_byte_start = raw_line.as_ptr() as usize - text_ptr;

        // Skip empty lines (but don't trim yet — we need the raw indentation to detect continuations)
        if raw_line.trim().is_empty() {
            continue;
        }

        let line_byte_end = line_byte_start + raw_line.len();

        // Detect continuation lines: a line that does NOT start with a 2-char alphanum tag
        // followed by whitespace or a dash. These are bare continuation lines that belong to
        // the previous tag (e.g. an N2/AB abstract that wraps without indentation).
        if is_continuation_line(raw_line) {
            if let Some(ref tag) = last_tag {
                if let Some(ref mut span) = current_citation.record_span {
                    span.end = line_byte_end;
                }
                // Append continuation text to the last value of the tag.
                if let Some(values) = current_citation.data.get_mut(tag)
                    && let Some(last_val) = values.last_mut()
                {
                    last_val.push(' ');
                    last_val.push_str(raw_line.trim());
                }
            } else {
                // No prior tag to attach to — treat as ignored
                current_citation.add_ignored_line(line_number, raw_line.trim().to_string());
            }
            continue;
        }

        let line = raw_line.trim();

        // Skip metadata lines
        if is_metadata_line(line) {
            continue;
        }

        match parse_ris_line(line, line_number) {
            Ok((tag, content)) => {
                match tag {
                    RisTag::Type => {
                        // Start of new citation
                        if current_citation.has_content() {
                            citations.push(current_citation);
                            current_citation = RawRisData::new();
                        }
                        last_tag = None;
                        current_citation.start_line = Some(line_number);
                        current_citation.record_span =
                            Some(SourceSpan::new(line_byte_start, line_byte_end));
                        current_citation.add_data(tag, content);
                    }
                    RisTag::EndOfReference => {
                        // Extend span to cover the ER line
                        if let Some(ref mut span) = current_citation.record_span {
                            span.end = line_byte_end;
                        }
                        last_tag = None;
                        // End of current citation
                        if current_citation.has_content() {
                            citations.push(current_citation);
                            current_citation = RawRisData::new();
                        }
                    }
                    tag if tag.is_author_tag() => {
                        if let Some(ref mut span) = current_citation.record_span {
                            span.end = line_byte_end;
                        }
                        last_tag = None;
                        let authors = split_and_parse_authors(&content);
                        for author in authors {
                            current_citation.add_author(author);
                        }
                    }
                    _ => {
                        if let Some(ref mut span) = current_citation.record_span {
                            span.end = line_byte_end;
                        }
                        last_tag = Some(tag.clone());
                        current_citation.add_data(tag, content);
                    }
                }
            }
            Err(_) => {
                if let Some(ref mut span) = current_citation.record_span {
                    span.end = line_byte_end;
                }
                last_tag = None;
                // Add invalid lines to ignored lines with context
                current_citation.add_ignored_line(line_number, line.to_string());
            }
        }
    }

    // Add the last citation if it has content
    if current_citation.has_content() {
        citations.push(current_citation);
    }

    if citations.is_empty() {
        return Ok(Vec::new());
    }

    Ok(citations)
}

/// Parse a single RIS line into a tag and content.
fn parse_ris_line(line: &str, line_number: usize) -> Result<(RisTag, String), ParseError> {
    // Validate minimum line length
    if line.len() < 2 {
        return Err(ParseError::at_line(
            line_number,
            CitationFormat::Ris,
            ValueError::Syntax(format!(
                "Line too short for RIS format (minimum 2 chars): '{}'",
                line
            )),
        ));
    }

    let tag_str = &line[..2];

    // Validate tag format
    if !tag_str.chars().all(|c| c.is_ascii_alphanumeric()) {
        return Err(ParseError::at_line(
            line_number,
            CitationFormat::Ris,
            ValueError::Syntax(format!("Invalid RIS tag format: '{}'", tag_str)),
        ));
    }

    let tag = RisTag::from_tag(tag_str);

    // Extract content
    let content = extract_ris_content(line, line_number)?;

    Ok((tag, content))
}

/// Extract content from a RIS line, handling various format patterns.
fn extract_ris_content(line: &str, line_number: usize) -> Result<String, ParseError> {
    // Standard format: "TY  - JOUR"
    if line.len() >= 6 && &line[2..6] == "  - " {
        return Ok(line[6..].trim().to_string());
    }

    // Format without space after dash: "ER  -"
    if line.len() >= 5 && &line[2..5] == "  -" {
        return Ok(line[5..].trim().to_string());
    }

    // Format without spaces before dash: "TY- JOUR"
    if line.len() >= 4 && &line[2..4] == "- " {
        return Ok(line[4..].trim().to_string());
    }

    // Minimal format: "TY-JOUR"
    if line.len() >= 3 && &line[2..3] == "-" {
        return Ok(line[3..].trim().to_string());
    }

    // Require proper separator (space or dash) after tag
    if line.len() > 2 {
        let third_char = line.chars().nth(2).unwrap();
        if third_char == ' ' || third_char == '-' {
            return Ok(line[2..].trim().to_string());
        }
    }

    // If we reach here, the line doesn't have a proper separator
    Err(ParseError::at_line(
        line_number,
        CitationFormat::Ris,
        ValueError::Syntax(format!(
            "RIS line missing proper separator (space or dash) after tag: '{}'",
            line
        )),
    ))
}

/// Split an author string into multiple authors and parse each one.
///
/// Handles non-standard RIS files where multiple authors are on a single AU line.
/// Splits on:
/// - Semicolons (`;`) - primary separator
/// - ` & ` and ` and ` - secondary separators (with surrounding spaces)
///
/// Does NOT split on bare commas since "Last, First" format uses commas.
fn split_and_parse_authors(author_str: &str) -> Vec<Author> {
    let trimmed = author_str.trim();
    if trimmed.is_empty() {
        return Vec::new();
    }

    // First split on semicolons (primary separator)
    let segments: Vec<&str> = trimmed.split(';').collect();

    let mut authors = Vec::new();

    for segment in segments {
        let segment = segment.trim();
        if segment.is_empty() {
            continue;
        }

        // Then split on " & " and " and " (secondary separators)
        let sub_segments: Vec<&str> = segment
            .split(" & ")
            .flat_map(|s| s.split(" and "))
            .collect();

        for sub in sub_segments {
            let sub = sub.trim();
            if !sub.is_empty() {
                authors.push(parse_author(sub));
            }
        }
    }

    // If no splits occurred, parse as single author
    if authors.is_empty() {
        authors.push(parse_author(trimmed));
    }

    authors
}

/// Parse an author string into an Author struct.
fn parse_author(author_str: &str) -> Author {
    let (family, given) = parse_author_name(author_str);
    let (given_opt, middle_opt) = if given.is_empty() {
        (None, None)
    } else {
        crate::utils::split_given_and_middle(&given)
    };
    Author {
        name: family,
        given_name: given_opt,
        middle_name: middle_opt,
        affiliations: Vec::new(),
    }
}

/// Returns true if the line is a continuation of a previous tag value rather than a new tag.
///
/// A well-formed RIS tag line looks like one of:
///   "TY  - JOUR"   (standard: 2-char tag + "  - ")
///   "TY- JOUR"     (no space before dash)
///   "TY-JOUR"      (minimal)
///   "ER  -"        (end of reference)
///
/// A line whose first two characters are alphanumeric but whose remainder does NOT match any
/// separator pattern is a bare continuation line belonging to the previous tag.
fn is_continuation_line(line: &str) -> bool {
    let bytes = line.as_bytes();
    if bytes.len() < 2 {
        return true;
    }
    // First two chars must be ASCII alphanumeric for it to even look like a tag
    if !bytes[0].is_ascii_alphanumeric() || !bytes[1].is_ascii_alphanumeric() {
        return false; // invalid tag chars — let parse_ris_line produce an error
    }
    // Check for the known separator patterns at position 2+
    if bytes.len() >= 6 && &bytes[2..6] == b"  - " {
        return false; // "TY  - ..." — proper tag
    }
    if bytes.len() >= 5 && &bytes[2..5] == b"  -" {
        return false; // "ER  -" — proper tag (no trailing content)
    }
    if bytes.len() >= 4 && &bytes[2..4] == b"- " {
        return false; // "TY- JOUR"
    }
    if bytes.len() >= 3 && bytes[2] == b'-' {
        return false; // "TY-JOUR" minimal
    }
    // Two alphanumeric chars followed by a space but no dash — continuation line
    // e.g. "At present, there are no..." where "At" looks like a tag but has no dash
    true
}

/// Check if a line is RIS metadata that should be ignored.
fn is_metadata_line(line: &str) -> bool {
    line.starts_with("Record #")
        || line.starts_with("Provider:")
        || line.starts_with("Content:")
        || line.starts_with("Database:")
}

#[cfg(test)]
mod tests {
    use super::*;
    use rstest::*;

    #[rstest]
    #[case("TY  - JOUR", RisTag::Type, "JOUR")]
    #[case("TI  - Test Title", RisTag::Title, "Test Title")]
    #[case("AU  - Smith, John", RisTag::Author, "Smith, John")]
    #[case("ER  -", RisTag::EndOfReference, "")]
    #[case("DO  - 10.1000/test", RisTag::Doi, "10.1000/test")]
    #[case("TY Content", RisTag::Type, "Content")]
    #[case("TY-Content", RisTag::Type, "Content")]
    fn test_parse_ris_line_valid(
        #[case] line: &str,
        #[case] expected_tag: RisTag,
        #[case] expected_content: &str,
    ) {
        let result = parse_ris_line(line, 1).unwrap();
        assert_eq!(result.0, expected_tag);
        assert_eq!(result.1, expected_content);
    }

    #[rstest]
    #[case("")]
    #[case("A")]
    #[case("!!  - Invalid tag")]
    #[case("TYNoSeparator")]
    #[case("TYBAD")]
    fn test_parse_ris_line_invalid(#[case] line: &str) {
        let result = parse_ris_line(line, 1);
        assert!(result.is_err());
    }

    #[rstest]
    #[case("Record #1 of 10", true)]
    #[case("Provider: Some Provider", true)]
    #[case("Content: text/plain", true)]
    #[case("Database: PubMed", true)]
    #[case("TY  - JOUR", false)]
    fn test_is_metadata_line(#[case] line: &str, #[case] expected: bool) {
        assert_eq!(is_metadata_line(line), expected);
    }

    #[test]
    fn test_parse_simple_citation() {
        let input = r#"TY  - JOUR
TI  - Test Article
AU  - Smith, John
ER  -"#;

        let result = ris_parse(input).unwrap();
        assert_eq!(result.len(), 1);

        let raw = &result[0];
        assert_eq!(raw.get_first(&RisTag::Type), Some(&"JOUR".to_string()));
        assert_eq!(
            raw.get_first(&RisTag::Title),
            Some(&"Test Article".to_string())
        );
        assert_eq!(raw.authors.len(), 1);
        assert_eq!(raw.authors[0].name, "Smith");
    }

    #[test]
    fn test_parse_multiple_citations() {
        let input = r#"TY  - JOUR
TI  - First Article
AU  - Smith, John
ER  -

TY  - BOOK
TI  - Second Article
AU  - Doe, Jane
ER  -"#;

        let result = ris_parse(input).unwrap();
        assert_eq!(result.len(), 2);

        assert_eq!(
            result[0].get_first(&RisTag::Type),
            Some(&"JOUR".to_string())
        );
        assert_eq!(
            result[1].get_first(&RisTag::Type),
            Some(&"BOOK".to_string())
        );
    }

    #[test]
    fn test_parse_with_metadata() {
        let input = r#"Record #1 of 2
Provider: Test Provider
Database: Test DB

TY  - JOUR
TI  - Test Article
AU  - Smith, John
ER  -"#;

        let result = ris_parse(input).unwrap();
        assert_eq!(result.len(), 1);
        assert_eq!(
            result[0].get_first(&RisTag::Title),
            Some(&"Test Article".to_string())
        );
    }

    #[test]
    fn test_parse_empty_input() {
        let result = ris_parse("").unwrap();
        assert!(result.is_empty());
    }

    #[test]
    fn test_parse_no_valid_citations() {
        let input = r#"Record #1 of 0
Provider: Test Provider"#;

        let result = ris_parse(input).unwrap();
        assert!(result.is_empty());
    }

    #[test]
    fn test_parse_with_invalid_lines() {
        let input = r#"TY  - JOUR
TI  - Test Article
!! - This is truly invalid
AU  - Smith, John
ER  -"#;

        let result = ris_parse(input).unwrap();
        assert_eq!(result.len(), 1);
        assert_eq!(result[0].ignored_lines.len(), 1);
        assert!(result[0].ignored_lines[0].1.contains("!!"));
    }

    #[test]
    fn test_parse_author() {
        let author = parse_author("Smith, John");
        assert_eq!(author.name, "Smith");
        assert_eq!(author.given_name.as_deref(), Some("John"));
        assert!(author.affiliations.is_empty());
    }

    #[test]
    fn test_split_authors_single() {
        let authors = split_and_parse_authors("Smith, John");
        assert_eq!(authors.len(), 1);
        assert_eq!(authors[0].name, "Smith");
    }

    #[test]
    fn test_split_authors_semicolon() {
        let authors = split_and_parse_authors("Smith, J.; Doe, A.; Brown, B.");
        assert_eq!(authors.len(), 3);
        assert_eq!(authors[0].name, "Smith");
        assert_eq!(authors[1].name, "Doe");
        assert_eq!(authors[2].name, "Brown");
    }

    #[test]
    fn test_split_authors_ampersand() {
        let authors = split_and_parse_authors("Smith, J. & Doe, A.");
        assert_eq!(authors.len(), 2);
        assert_eq!(authors[0].name, "Smith");
        assert_eq!(authors[1].name, "Doe");
    }

    #[test]
    fn test_split_authors_and() {
        let authors = split_and_parse_authors("Smith, J. and Doe, A.");
        assert_eq!(authors.len(), 2);
        assert_eq!(authors[0].name, "Smith");
        assert_eq!(authors[1].name, "Doe");
    }

    #[test]
    fn test_split_authors_mixed() {
        let authors = split_and_parse_authors("Smith, J.; Doe, A. & Brown, B.");
        assert_eq!(authors.len(), 3);
        assert_eq!(authors[0].name, "Smith");
        assert_eq!(authors[1].name, "Doe");
        assert_eq!(authors[2].name, "Brown");
    }

    #[test]
    fn test_split_authors_reported_issue() {
        // The reported issue: "Abebe, T., Alemu, B., & Teshome, M"
        // We split on " & " so get 2 authors (commas don't split)
        let authors = split_and_parse_authors("Abebe, T., Alemu, B., & Teshome, M");
        assert_eq!(authors.len(), 2);
        assert_eq!(authors[0].name, "Abebe");
        assert_eq!(authors[1].name, "Teshome");
    }

    #[test]
    fn test_split_authors_empty() {
        let authors = split_and_parse_authors("");
        assert!(authors.is_empty());
    }

    #[test]
    fn test_n2_continuation_line_without_leading_space() {
        // Real-world case: N2 abstract where the second line has no leading space/indent.
        let input = "TY  - JOUR\nTI  - Test\nN2  - Brief Summary\nAt present, there are no relevant studies.\nER  -\n";
        let result = ris_parse(input).unwrap();
        assert_eq!(result.len(), 1);
        let abstract_val = result[0].get_first(&RisTag::AbstractAlternative).unwrap();
        assert!(
            abstract_val.contains("At present"),
            "continuation line should be appended to N2: got '{}'",
            abstract_val
        );
        assert_eq!(
            abstract_val,
            "Brief Summary At present, there are no relevant studies."
        );
    }

    #[test]
    fn test_ab_continuation_line_without_leading_space() {
        let input = "TY  - JOUR\nTI  - Test\nAB  - First sentence.\nSecond sentence continues here.\nER  -\n";
        let result = ris_parse(input).unwrap();
        assert_eq!(result.len(), 1);
        let abstract_val = result[0].get_first(&RisTag::Abstract).unwrap();
        assert!(abstract_val.contains("Second sentence continues here."));
    }
}