bed-reader 1.0.6

Read and write the PLINK BED format, simply and efficiently.
Documentation
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Set `ssd_path` to a folder in which a file can be placed. It does not need to be \"ssd\" drive, but faster drives are better."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "metadata": {}
   },
   "outputs": [],
   "source": [
    "import time\n",
    "from pathlib import Path\n",
    "import numpy as np\n",
    "from bed_reader import create_bed, open_bed\n",
    "\n",
    "ssd_path = Path(\"m:/deldir/bench2\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "On my system, it takes about 5 minutes to generate the 61 GB file (1/16th of a 1TB drive). The file is filed with a pattern of 0, 1, and 2 values.\n",
    "\n",
    "In this case, we are pretending that we can more easily get all the SNP information for one individual at a time, so we write the file in the (less common) \"individual-major\" format."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "metadata": {}
   },
   "outputs": [],
   "source": [
    "iid_count = 250_000\n",
    "sid_count = 1_000_000\n",
    "\n",
    "file_path = ssd_path / f\"{iid_count}x{sid_count}mode0.bed\"\n",
    "\n",
    "snp_row = np.array(range(sid_count)) % 3\n",
    "snp_row = snp_row.astype(np.uint8)\n",
    "\n",
    "start_time = time.time()\n",
    "with create_bed(file_path, iid_count=iid_count, sid_count=sid_count, major=\"individual\") as bed_writer:\n",
    "    for iid_index in range(iid_count):\n",
    "        if iid_index % 1_000 == 0:\n",
    "            current_time = time.time()\n",
    "            elapsed_time = current_time - start_time\n",
    "            if iid_index > 0:\n",
    "                estimated_total_time = elapsed_time / iid_index * iid_count\n",
    "                time_remaining = estimated_total_time - elapsed_time\n",
    "                print(f\"iid_index={iid_index:_}, Time elapsed: {elapsed_time:.2f} s, Estimated total time: {estimated_total_time:.2f} s, Time remaining: {time_remaining:.2f} s\")\n",
    "            else:\n",
    "                print(f\"Starting processing at iid_index={iid_index:_}\")\n",
    "        bed_writer.write(snp_row)\n",
    "        snp_row[iid_index] = (snp_row[iid_index] + 1) % 3\n",
    "total_time = time.time() - start_time\n",
    "print(f\"Processing complete. Total time taken: {total_time:.2f} s\")        \n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "We read from the file to check that it contains the expected values."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "metadata": {}
   },
   "outputs": [],
   "source": [
    "with open_bed(file_path) as bed_reader:\n",
    "    val = bed_reader.read(np.s_[:10, :10])\n",
    "\n",
    "assert np.all(\n",
    "    val\n",
    "    == [\n",
    "        [0.0, 1.0, 2.0, 0.0, 1.0, 2.0, 0.0, 1.0, 2.0, 0.0],\n",
    "        [1.0, 1.0, 2.0, 0.0, 1.0, 2.0, 0.0, 1.0, 2.0, 0.0],\n",
    "        [1.0, 2.0, 2.0, 0.0, 1.0, 2.0, 0.0, 1.0, 2.0, 0.0],\n",
    "        [1.0, 2.0, 0.0, 0.0, 1.0, 2.0, 0.0, 1.0, 2.0, 0.0],\n",
    "        [1.0, 2.0, 0.0, 1.0, 1.0, 2.0, 0.0, 1.0, 2.0, 0.0],\n",
    "        [1.0, 2.0, 0.0, 1.0, 2.0, 2.0, 0.0, 1.0, 2.0, 0.0],\n",
    "        [1.0, 2.0, 0.0, 1.0, 2.0, 0.0, 0.0, 1.0, 2.0, 0.0],\n",
    "        [1.0, 2.0, 0.0, 1.0, 2.0, 0.0, 1.0, 1.0, 2.0, 0.0],\n",
    "        [1.0, 2.0, 0.0, 1.0, 2.0, 0.0, 1.0, 2.0, 2.0, 0.0],\n",
    "        [1.0, 2.0, 0.0, 1.0, 2.0, 0.0, 1.0, 2.0, 0.0, 0.0],\n",
    "    ]\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "metadata": {}
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(250000, 1000000)"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "bed_reader = open_bed(file_path) # open and keep open\n",
    "bed_reader.shape"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Now we use `bed-reader` to convert the file the less common individual-major mode (also known as \"mode 0\") into the usual SNP-major mode (mode 1).\n",
    "\n",
    "On my machine, this takes about an hour. The script works my reading the data for 1000 SNPs at a time. This requires more memory than reading data of 1 SNP at a time, but is much, much faster. If you need to use less memory, change `sid_at_a_time` to a smaller number such as 500, 250, 100, etc."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "metadata": {}
   },
   "outputs": [],
   "source": [
    "sid_at_a_time = 1000\n",
    "file_path1 = ssd_path / f\"{iid_count}x{sid_count}mode1.bed\"\n",
    "start_time = time.time()\n",
    "\n",
    "assert (\n",
    "    bed_reader.major == \"individual\"\n",
    "), \"No need to transpose if major is already SNP-major\"\n",
    "\n",
    "with create_bed(\n",
    "    file_path1,\n",
    "    iid_count=bed_reader.iid_count,\n",
    "    sid_count=bed_reader.sid_count,\n",
    "    properties=bed_reader.properties,\n",
    "    major=\"SNP\",\n",
    ") as bed_writer:\n",
    "    for sid_index in range(0, bed_reader.sid_count, sid_at_a_time):\n",
    "        if sid_index % 1 == 0:\n",
    "            current_time = time.time()\n",
    "            elapsed_time = current_time - start_time\n",
    "            if sid_index > 0:\n",
    "                estimated_total_time = elapsed_time / sid_index * sid_count\n",
    "                time_remaining = estimated_total_time - elapsed_time\n",
    "                print(\n",
    "                    f\"sid_index={sid_index:_}, Time elapsed: {elapsed_time:.2f} s, Estimated total time: {estimated_total_time:.2f} s, Time remaining: {time_remaining:.2f} s\"\n",
    "                )\n",
    "            else:\n",
    "                print(f\"Starting processing at sid_index={sid_index:_}\")\n",
    "        iid_column_by_chunk = bed_reader.read(\n",
    "            np.s_[:, sid_index : sid_index + sid_at_a_time], dtype=np.int8\n",
    "        )\n",
    "        for iid_column in iid_column_by_chunk.T:\n",
    "            bed_writer.write(iid_column)\n",
    "total_time = time.time() - start_time\n",
    "print(f\"Processing complete. Total time taken: {total_time:.2f} s\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "We confirm that the new copy of the file is the same size and starts with the same values as before."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "metadata": {}
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(250000, 1000000)\n"
     ]
    }
   ],
   "source": [
    "with open_bed(file_path1) as bed_reader1:\n",
    "    print(bed_reader1.shape)\n",
    "    val1 = bed_reader1.read(np.s_[:10, :10])\n",
    "assert np.all(\n",
    "    val1    == [\n",
    "        [0.0, 1.0, 2.0, 0.0, 1.0, 2.0, 0.0, 1.0, 2.0, 0.0],\n",
    "        [1.0, 1.0, 2.0, 0.0, 1.0, 2.0, 0.0, 1.0, 2.0, 0.0],\n",
    "        [1.0, 2.0, 2.0, 0.0, 1.0, 2.0, 0.0, 1.0, 2.0, 0.0],\n",
    "        [1.0, 2.0, 0.0, 0.0, 1.0, 2.0, 0.0, 1.0, 2.0, 0.0],\n",
    "        [1.0, 2.0, 0.0, 1.0, 1.0, 2.0, 0.0, 1.0, 2.0, 0.0],\n",
    "        [1.0, 2.0, 0.0, 1.0, 2.0, 2.0, 0.0, 1.0, 2.0, 0.0],\n",
    "        [1.0, 2.0, 0.0, 1.0, 2.0, 0.0, 0.0, 1.0, 2.0, 0.0],\n",
    "        [1.0, 2.0, 0.0, 1.0, 2.0, 0.0, 1.0, 1.0, 2.0, 0.0],\n",
    "        [1.0, 2.0, 0.0, 1.0, 2.0, 0.0, 1.0, 2.0, 2.0, 0.0],\n",
    "        [1.0, 2.0, 0.0, 1.0, 2.0, 0.0, 1.0, 2.0, 0.0, 0.0],\n",
    "    ]\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}