tsalign 2.1.2

A sequence-to-sequence aligner that accounts for template switches
Documentation
use anyhow::{Result, anyhow};
use compact_genome::io::fasta::FastaRecord;
use log::debug;
use std::{
    fs::File,
    io::{BufReader, Read},
    path::Path,
};
use utf8_chars::BufReadCharsExt;

pub fn parse_pair_fasta_file(
    path: impl AsRef<Path>,
) -> Result<(FastaRecord<String>, FastaRecord<String>)> {
    let mut result = parse_fasta_file(path, true)?;
    let second = result.remove(1);
    let first = result.remove(0);
    Ok((first, second))
}

pub fn parse_single_fasta_file(path: impl AsRef<Path>) -> Result<FastaRecord<String>> {
    Ok(parse_fasta_file(path, false)?.remove(0))
}

fn parse_fasta_file(path: impl AsRef<Path>, pair: bool) -> Result<Vec<FastaRecord<String>>> {
    let path = path.as_ref();
    debug!("Parsing fasta file {path:?}");

    enum State {
        FileStart,
        ParseId,
        ParseComment,
        ParseSequence,
    }

    let mut input = CharacterIterator::new(BufReader::new(
        File::open(path).map_err(|error| anyhow!("Unable to open input file {path:?}: {error}"))?,
    ))
    .peekable();
    let mut state = State::FileStart;
    let mut current_record = FastaRecord {
        id: String::new(),
        comment: String::new(),
        sequence_handle: String::new(),
    };
    let mut records = Vec::<FastaRecord<String>>::new();

    'parser: loop {
        match state {
            State::FileStart => {
                let mut newline = true;

                'find_first_record: loop {
                    match input.next() {
                        Some(result) => match result? {
                            Character::Newline => newline = true,
                            Character::RecordStart => {
                                if newline {
                                    state = State::ParseId;
                                    break 'find_first_record;
                                } else {
                                    return Err(anyhow!(
                                        "First fasta record is not preceded by a newline character"
                                    ));
                                }
                            }
                            Character::Other(c) => {
                                newline = false;
                                if !c.is_whitespace() {
                                    return Err(anyhow!(
                                        "Found non-whitespace character before first fasta record: {c}"
                                    ));
                                }
                            }
                        },
                        None => {
                            return Err(anyhow!("Input file {path:?} contains no fasta record"));
                        }
                    }
                }
            }
            State::ParseId => 'collect_id: loop {
                match input.next() {
                    Some(result) => match result? {
                        Character::Newline => {
                            state = State::ParseSequence;
                            break 'collect_id;
                        }
                        Character::RecordStart => current_record.id.push('>'),
                        Character::Other(c) => {
                            if c.is_whitespace() {
                                state = State::ParseComment;
                                break 'collect_id;
                            } else {
                                current_record.id.push(c);
                            }
                        }
                    },
                    None => {
                        records.push(current_record);
                        break 'parser;
                    }
                }
            },
            State::ParseComment => 'collect_comment: loop {
                match input.next() {
                    Some(result) => match result? {
                        Character::Newline => {
                            state = State::ParseSequence;
                            break 'collect_comment;
                        }
                        Character::RecordStart => current_record.comment.push('>'),
                        Character::Other(c) => current_record.comment.push(c),
                    },
                    None => {
                        records.push(current_record);
                        break 'parser;
                    }
                }
            },
            State::ParseSequence => {
                let mut newline = true;

                'collect_sequence: loop {
                    match input.next() {
                        Some(result) => match result? {
                            Character::Newline => newline = true,
                            Character::RecordStart => {
                                if newline {
                                    records.push(current_record);
                                    current_record = FastaRecord {
                                        id: String::new(),
                                        comment: String::new(),
                                        sequence_handle: String::new(),
                                    };
                                    state = State::ParseId;
                                    break 'collect_sequence;
                                } else {
                                    current_record.sequence_handle.push('>');
                                    newline = false;
                                }
                            }
                            Character::Other(c) => {
                                current_record.sequence_handle.push(c);
                                newline = false;
                            }
                        },
                        None => {
                            records.push(current_record);
                            break 'parser;
                        }
                    }
                }
            }
        }
    }

    if pair {
        if records.len() != 2 {
            Err(anyhow!(
                "Expected paired fasta file with two records, but found {} records",
                records.len()
            ))
        } else {
            Ok(records)
        }
    } else if records.len() != 1 {
        Err(anyhow!(
            "Expected single-record fasta file, but found {} records",
            records.len()
        ))
    } else {
        Ok(records)
    }
}

enum Character {
    Newline,
    RecordStart,
    Other(char),
}

struct CharacterIterator<Reader: Read + ?Sized> {
    reader: BufReader<Reader>,
}

impl<Reader: Read> CharacterIterator<Reader> {
    fn new(reader: BufReader<Reader>) -> Self {
        Self { reader }
    }
}

impl<Reader: Read + ?Sized> Iterator for CharacterIterator<Reader> {
    type Item = Result<Character>;

    fn next(&mut self) -> Option<Self::Item> {
        self.reader
            .read_char_raw()
            .map(|result| {
                result.map(|c| {
                    if c == '\n' || c == '\r' {
                        Character::Newline
                    } else if c == '>' {
                        Character::RecordStart
                    } else {
                        Character::Other(c)
                    }
                })
            })
            .map_err(|error| anyhow!("Error reading character from fasta input file: {error}"))
            .transpose()
    }
}