from __future__ import annotations
import os
import subprocess
import tempfile
from dataclasses import dataclass
import numpy as np
_N_LEADING = 3
@dataclass
class FeatureTable:
columns: list[str] data: np.ndarray contig: np.ndarray pos: np.ndarray ref: np.ndarray manifest_path: str
def __len__(self) -> int:
return self.data.shape[0]
def features(
index: str,
alignments: str,
*,
binary: str = "rosalind",
max_depth: int = 1000,
max_read_len: int = 250,
mapq: int = 0,
memory_budget_mb: int | None = None,
enforce: bool = False,
workdir: str | None = None,
) -> FeatureTable:
tmp = workdir or tempfile.mkdtemp(prefix="rosalind-features-")
os.makedirs(tmp, exist_ok=True)
tsv = os.path.join(tmp, "features.tsv")
manifest = tsv + ".manifest.json"
cmd = [
binary, "features",
"--index", str(index),
"--alignments", str(alignments),
"--max-depth", str(max_depth),
"--max-read-len", str(max_read_len),
"--mapq-threshold", str(mapq),
"-o", tsv,
"--manifest", manifest,
]
if memory_budget_mb is not None:
cmd += ["--memory-budget-mb", str(memory_budget_mb)]
if enforce:
cmd += ["--enforce"]
subprocess.run(cmd, check=True)
return load_feature_tsv(tsv, manifest_path=manifest)
def load_feature_tsv(path: str, *, manifest_path: str = "") -> FeatureTable:
with open(path, "r", encoding="ascii") as fh:
header = fh.readline().lstrip("#").rstrip("\n").split("\t")
rows = [line.rstrip("\n").split("\t") for line in fh if line.strip()]
if not rows:
empty = np.empty((0, len(header) - _N_LEADING), dtype=np.float64)
return FeatureTable(header[_N_LEADING:], empty, np.array([]), np.array([], dtype=np.int64), np.array([]), manifest_path)
contig = np.array([r[0] for r in rows], dtype="S32")
pos = np.array([int(r[1]) for r in rows], dtype=np.int64)
ref = np.array([r[2] for r in rows], dtype="S1")
data = np.array([r[_N_LEADING:] for r in rows], dtype=np.float64)
return FeatureTable(header[_N_LEADING:], data, contig, pos, ref, manifest_path)
if __name__ == "__main__":
import sys
if len(sys.argv) < 4:
print("usage: python rosalind.py <binary> <index> <sorted.bam>", file=sys.stderr)
raise SystemExit(2)
ft = features(sys.argv[2], sys.argv[3], binary=sys.argv[1])
print(f"loaded {len(ft)} loci x {ft.data.shape[1]} numeric features")
print("columns:", ft.columns)
print("receipt:", ft.manifest_path)