block-aligner 0.5.1

SIMD-accelerated library for computing global and X-drop affine gap penalty sequence-to-sequence or sequence-to-profile alignments using an adaptive block-based algorithm.
Documentation
import random

lookup_path = "../data/scop/scop_lookup.fix.tsv"
pssm_path = "../data/scop/scop_mmseqs_pssm.pssm"
seq_path = "../data/scop/scop.fasta"
res_path = "../data/scop/pairs.pssm"

seq_to_scop = {}
families = {}

def process_scop_id(scop_id):
    #return scop_id[:scop_id.rindex(".")]
    return scop_id

with open(lookup_path) as f:
    for line in f:
        seq_id, scop_id = line.strip().split()
        scop_id = process_scop_id(scop_id)
        seq_to_scop[seq_id] = scop_id
        families[scop_id] = ([], [])

seq_lines = {}
seq_seqs = {}

with open(pssm_path) as f:
    curr_seq = None
    for line in f:
        if line.startswith("#"):
            curr_seq = line[1:].strip()
            seq_lines[curr_seq] = []
        else:
            seq_lines[curr_seq].append(line.strip())

with open(seq_path) as f:
    curr_seq = None
    for line in f:
        if line.startswith(">"):
            curr_seq = line[1:].strip()
            seq_seqs[curr_seq] = ""
        else:
            seq_seqs[curr_seq] += line.strip()

for seq_id, pssm in seq_lines.items():
    scop_id = seq_to_scop[seq_id]
    families[scop_id][0].append(pssm)

for seq_id, seq in seq_seqs.items():
    if not seq_id in seq_to_scop:
        continue
    scop_id = seq_to_scop[seq_id]
    families[scop_id][1].append(seq)

seq_pssm_pairs = []

def consensus_seq(lines):
    return "".join([s.split()[1] for s in lines[1:]])

for _, (pssm_family, seq_family) in families.items():
    random.shuffle(pssm_family)
    random.shuffle(seq_family)
    for i in range(min(len(pssm_family), len(seq_family))):
        pssm = pssm_family[i]
        seq = seq_family[i]
        seq_pssm_pairs.append((seq, consensus_seq(pssm), pssm))

print("Number of seq-pssm pairs:", len(seq_pssm_pairs))

with open(res_path, "w") as f:
    for seq, cns, pssm in seq_pssm_pairs:
        f.write("#" + seq + "\n")
        f.write("#" + cns + "\n")
        f.write("\n".join(pssm) + "\n")