Documentation
import re


def calculate_identity(cigar_string):
    """
    Calculate the identity of a sequence based on its CIGAR string.

    Args:
        cigar_string (str): CIGAR string (e.g., "17S3=1X21=1D8=1D23=1I").

    Returns:
        float: Identity value as a ratio.
    """
    # Parse the CIGAR string using regex
    operations = re.findall(r"(\d+)([MIDNSHP=X])", cigar_string)

    # Initialize counts
    matches = 0
    mismatches = 0
    insertions = 0
    deletions = 0

    # Process each operation
    for length, op in operations:
        length = int(length)
        if op == "=":
            matches += length
        elif op == "X":
            mismatches += length
        elif op == "I":
            insertions += length
        elif op == "D":
            deletions += length

    # Compute identity
    total_bases = matches + mismatches + insertions + deletions
    if total_bases == 0:
        return 0  # Avoid division by zero

    identity = matches / total_bases
    return identity


def calculate_reference_length(cigar_string):
    """
    Calculate the reference length of a sequence alignment based on its CIGAR string.

    Args:
        cigar_string (str): CIGAR string (e.g., "17S3=1X21=1D8=1D23=1I").

    Returns:
        int: Reference length.
    """
    # Parse the CIGAR string using regex
    operations = re.findall(r"(\d+)([MIDNSHP=X])", cigar_string)

    # Initialize reference length
    reference_length = 0

    # Process each operation
    for length, op in operations:
        length = int(length)
        if op in ("=", "X", "D"):  # Contributes to reference length
            reference_length += length

    return reference_length


if __name__ == "__main__":

    # 示例调用
    cigar = "17=1I3=2I18=1I4=1D20=4I5=3I15=1I6=1I4=1D22=1I5=1D16=1D7=1I8=2X5=1I48=2I9=1I2=1D20=1I12=1D10=1X30=1D4=1D9=1I42=1I4=1D27=2D3=1D21=1D3=1D43=1D7=2D11=1D14=2I8=1D8=2X15=2D5=2I3=1D24=1D18=1D3=1D7=1D21=1X10=1D5=1I36=1I28=1X1=2D2=1D1=1D5=1I14=1I21=1I5=1D5=1I10=1D3=2D24=1I9=1D2=2D12=1D15=1D16=1D3=1D3=1D4=1D2=1D22=1I13=1I30=1D11=1I21=1D15=1I6=1I9=1D32=2D4=1X1=1X5=1I26=3I11=1I5=1D27=2D12=2I6=1D5=1I11=1D2=1D9=1D7=1I3=1I18=1D16=1X1=1X7=1I39=1X4=1D5=2I18=1D6=1D19=1I16=1I3=1X5=2I5=2X32=1I8=1D10=1D6=1D16=1I35=1D4=1I4=1I17=1I34=1D8=1D21=2D5=1D8=1D21=1I15=2I14=1D14=1X2=1D6=2I5=1I22=1D4=1D3=1I23=2D8=2I11=1X7=1D15=2I2=1D10=1D9=2I13=1I13=4I2=10I2=5I13=1I11=8S"
    identity = calculate_identity(cigar)
    print(f"CIGAR: {cigar}")
    print(f"Identity: {identity:.2%}")
    length = calculate_reference_length(cigar)
    print(f"ref_aligned_length:{length}")

    cigar = "17=1I3=2I18=1I4=1D20=4I5=3I15=1I6=1I4=1D22=1I5=1D16=1D7=1I8=2X5=1I48=2I9=1I2=1D20=1I12=1D10=1X30=1D4=1D9=1I42=1I4=1D27=2D3=1D21=1D3=1D43=1D7=2D11=1D14=2I8=1D8=2X15=2D5=2I3=1D24=1D18=1D3=1D7=1D21=1X10=1D5=1I36=1I28=1X1=2D2=1D1=1D5=1I14=1I21=1I5=1D5=1I10=1D3=2D24=1I9=1D2=2D12=1D15=1D16=1D3=1D3=1D4=1D2=1D22=1I13=1I30=1D11=1I21=1D15=1I6=1I9=1D32=2D4=1X1=1X5=1I26=3I11=1I5=1D27=2D12=2I6=1D5=1I11=1D2=1D9=1D7=1I3=1I18=1D16=1X1=1X7=1I39=1X4=1D5=2I18=1D6=1D19=1I16=1I3=1X5=2I5=2X32=1I8=1D10=1D6=1D16=1I35=1D4=1I4=1I17=1I34=1D8=1D21=2D5=1D8=1D21=1I15=2I14=1D14=1X2=1D6=2I5=1I22=1D4=1D3=1I23=2D8=2I11=1X7=1D15=2I2=1D10=1D9=2I13=1I13=56S"
    identity = calculate_identity(cigar)
    print(f"CIGAR: {cigar}")
    print(f"Identity: {identity:.2%}")
    length = calculate_reference_length(cigar)
    print(f"ref_aligned_length:{length}")