{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "f7952927-1f3f-49cd-9f1c-d6dad0835ae9",
   "metadata": {},
   "source": [
    "# HDF5 compressed chunk direct read\n",
    "\n",
    "This notebooks illustrate how to read compressed chunks directly and decompress them from Python for `Blosc2` and `Bitshuffle` filters.\n",
    "\n",
    "It compares this approach with reading compressed chunks with `h5py` and `hdf5plugin`."
   ]
  },
  {
   "cell_type": "markdown",
   "id": "d452efbf-8c70-45ab-bb3a-885491601785",
   "metadata": {
    "tags": []
   },
   "source": [
    "## hdf5plugin config\n",
    "\n",
    "Performance changes depending on `hdf5plugin` build config, environment variables (`OPENMP_NUM_THREADS` and `BLOSC_NTHREADS`) and available CPU cores."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "fc1c65ce-75b9-460f-9065-be975e2ab6c8",
   "metadata": {
    "tags": []
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Number of CPU: 1; Affinity: {0}\n",
      "env:\n",
      "  OPENMP_NUM_THREADS: 1\n",
      "  BLOSC_NTHREADS: 1\n",
      "\n",
      "hdf5plugin:\n",
      "  Version: 4.2.0\n",
      "  Build config:\n",
      "  openmp: True\n",
      "  native: True\n",
      "  bmi2: True\n",
      "  sse2: True\n",
      "  avx2: True\n",
      "  avx512: False\n",
      "  cpp11: True\n",
      "  cpp14: True\n",
      "  ipp: False\n",
      "  filter_file_extension: .so\n",
      "  embedded_filters: ('blosc', 'blosc2', 'bshuf', 'bzip2', 'fcidecomp', 'lz4', 'sz', 'sz3', 'zfp', 'zstd')\n",
      "\n"
     ]
    }
   ],
   "source": [
    "#Set affinity and multithreading env. var. before any import\n",
    "import os\n",
    "\n",
    "os.sched_setaffinity(0, [0])\n",
    "\n",
    "AFFINITY = os.sched_getaffinity(0)\n",
    "NCPU = len(AFFINITY)\n",
    "\n",
    "print(f\"Number of CPU: {NCPU}; Affinity: {AFFINITY}\")\n",
    "\n",
    "os.environ[\"OPENMP_NUM_THREADS\"] = str(NCPU)\n",
    "os.environ[\"BLOSC_NTHREADS\"] = str(NCPU)\n",
    "print(f\"\"\"env:\n",
    "  OPENMP_NUM_THREADS: {os.environ.get(\"OPENMP_NUM_THREADS\", \"unset\")}\n",
    "  BLOSC_NTHREADS: {os.environ.get(\"BLOSC_NTHREADS\", \"unset\")}\n",
    "\"\"\")\n",
    "\n",
    "import h5py\n",
    "import hdf5plugin\n",
    "\n",
    "config = hdf5plugin.get_config()\n",
    "\n",
    "print(f\"\"\"hdf5plugin:\n",
    "  Version: {hdf5plugin.version}\n",
    "  Build config:\n",
    "{'''\n",
    "'''.join(f'  {k}: {v}' for k, v in config.build_config._asdict().items())}\n",
    "\"\"\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "89f06f93-9538-43cb-b1d9-f8c4cca8ada8",
   "metadata": {
    "tags": []
   },
   "source": [
    "## Demo data\n",
    "\n",
    "Data file is available here: http://www.silx.org/pub/pyFAI/pyFAI_UM_2020/data_ID13/kevlar.h5\n",
    "\n",
    "Prepare 2 files with 2 compressed datasets with `Blosc2` and `Bitshuffle` filters."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "e06d1bbe-c23b-4c5f-a3d6-51c98018afa9",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "# Download dataset\n",
    "!wget -O /dev/shm/kevlar.h5 http://www.silx.org/pub/pyFAI/pyFAI_UM_2020/data_ID13/kevlar.h5"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "3d62a294-0016-40e2-83d3-d0829be0b81f",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "import h5py\n",
    "import hdf5plugin\n",
    "\n",
    "with h5py.File(\"/dev/shm/kevlar.h5\", \"r\") as h:\n",
    "    data_ref = h[\"/entry/data/data\"][500]\n",
    "\n",
    "with h5py.File(\"/dev/shm/kevlar_blosc2.h5\", \"w\") as h:\n",
    "    h.create_dataset(\n",
    "        \"data\",\n",
    "        data=data_ref,\n",
    "        chunks=data_ref.shape,\n",
    "        compression=hdf5plugin.Blosc2(\n",
    "            cname='lz4',\n",
    "            clevel=5,\n",
    "            filters=hdf5plugin.Blosc2.BITSHUFFLE,\n",
    "        ),\n",
    "    )\n",
    "with h5py.File(\"/dev/shm/kevlar_bitshuffle.h5\", \"w\") as h:\n",
    "    h.create_dataset(\n",
    "        \"data\",\n",
    "        data=data_ref,\n",
    "        chunks=data_ref.shape,\n",
    "        compression=hdf5plugin.Bitshuffle(),\n",
    "    )"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "da76a266-89ab-45aa-9dd4-b9117f9e5c36",
   "metadata": {},
   "source": [
    "## With Blosc2\n",
    "\n",
    "Read compressed chunk with `read_direct_chunk` and decompress it with `blosc2`."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "abef1f80-011a-489b-8dc7-6e5a84d4be4a",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "import blosc2\n",
    "import numpy\n",
    "\n",
    "def decompress_blosc2_chunk(chunk: bytes, array: numpy.ndarray):\n",
    "    \"\"\"Decompress chunk data to provided array\"\"\"\n",
    "    blosc2.schunk_from_cframe(chunk).get_slice(out=array)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "1766ae0b-1130-48cb-9378-47baef27b68b",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "# Allocate array\n",
    "with h5py.File(\"/dev/shm/kevlar_blosc2.h5\", \"r\") as h:\n",
    "    ds = h[\"data\"]\n",
    "    array = numpy.empty(ds.shape, dtype=ds.dtype)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "228421e8-60d3-4ae8-b8b2-86e3b1d88c7d",
   "metadata": {
    "tags": []
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<TimeitResult : 3.7 ms ± 209 µs per loop (mean ± std. dev. of 10 runs, 10 loops each)>"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "%%timeit -r10 -n10 -o -q\n",
    "\n",
    "# Read compressed chunk and decompress into array\n",
    "with h5py.File(\"/dev/shm/kevlar_blosc2.h5\", \"r\") as h5file:\n",
    "    ds = h5file[\"data\"]\n",
    "    filter_mask, chunk = ds.id.read_direct_chunk(ds.id.get_chunk_info(0).chunk_offset)\n",
    "    decompress_blosc2_chunk(chunk, array)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "515244ea-3fe7-4065-9f9b-85e61a58f778",
   "metadata": {
    "tags": []
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<TimeitResult : 8.92 ms ± 116 µs per loop (mean ± std. dev. of 10 runs, 10 loops each)>"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "%%timeit -r10 -n10 -o -q\n",
    "\n",
    "# Read data through h5py and libhdf5\n",
    "with h5py.File(\"/dev/shm/kevlar_blosc2.h5\", \"r\") as h5file:\n",
    "    data = h5file[\"data\"][()]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "4075e042-f8bd-46f9-bcb9-000a6e20dbca",
   "metadata": {
    "tags": []
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<TimeitResult : 5.18 ms ± 110 µs per loop (mean ± std. dev. of 10 runs, 10 loops each)>"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "%%timeit -r10 -n10 -o -q\n",
    "\n",
    "# Use read_direct\n",
    "with h5py.File(\"/dev/shm/kevlar_blosc2.h5\", \"r\") as h:\n",
    "    h[\"data\"].read_direct(array)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "b3fb49a0-c3d6-4f73-a553-26bae5fa7362",
   "metadata": {
    "tags": []
   },
   "source": [
    "## With Bitshuffle\n",
    "\n",
    "Read compressed chunk with `read_direct_chunk` and decompress it with `bitshuffle`."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "9fd29f2b-d5a1-4504-bc60-25aab2f0732b",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "import struct\n",
    "import bitshuffle\n",
    "import numpy\n",
    "\n",
    "def decompress_bslz4_chunk(payload, dtype, chunk_shape):\n",
    "    \"\"\"This function decompresses ONE chunk with bitshuffle-LZ4. \n",
    "    The library needs to be compiled without OpenMP when using threads !\n",
    "    \n",
    "    :param payload: string with the compressed data as read by h5py.\n",
    "    :param dtype: data type of the stored content\n",
    "    :param chunk_shape: shape of one chunk\n",
    "    :return: decompressed chunk\"\"\"\n",
    "    total_nbytes, block_nbytes = struct.unpack(\">QI\", payload[:12])\n",
    "    block_size = block_nbytes // dtype.itemsize\n",
    "\n",
    "    arr = numpy.frombuffer(payload, dtype=numpy.uint8, offset=12)  # No copy here\n",
    "    chunk_data = bitshuffle.decompress_lz4(arr, chunk_shape, dtype, block_size)\n",
    "    return chunk_data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "5324783e-7726-4e22-98a2-9319776eae76",
   "metadata": {
    "tags": []
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<TimeitResult : 4.35 ms ± 104 µs per loop (mean ± std. dev. of 10 runs, 10 loops each)>"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "%%timeit -r10 -n10 -o -q\n",
    "\n",
    "# Read compressed chunk and decompress it\n",
    "with h5py.File(\"/dev/shm/kevlar_bitshuffle.h5\", \"r\") as h:\n",
    "    ds = h[\"data\"]\n",
    "    filter_mask, chunk = ds.id.read_direct_chunk(ds.id.get_chunk_info(0).chunk_offset)\n",
    "    array = decompress_bslz4_chunk(chunk, ds.dtype, ds.chunks)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "07581996-beb6-4921-b88d-84c988fc38ef",
   "metadata": {
    "tags": []
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<TimeitResult : 9.24 ms ± 154 µs per loop (mean ± std. dev. of 10 runs, 10 loops each)>"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "%%timeit -r10 -n10 -o -q\n",
    "\n",
    "# Read data through h5py and libhdf5\n",
    "with h5py.File(\"/dev/shm/kevlar_bitshuffle.h5\", \"r\") as h:\n",
    "    data = h[\"data\"][()]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "e1e326eb-b3d7-4cf4-92b1-3946bd8b8d7e",
   "metadata": {
    "tags": []
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<TimeitResult : 5.45 ms ± 175 µs per loop (mean ± std. dev. of 10 runs, 10 loops each)>"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "%%timeit -r10 -n10 -o -q\n",
    "\n",
    "# Use read_direct\n",
    "with h5py.File(\"/dev/shm/kevlar_bitshuffle.h5\", \"r\") as h:\n",
    "    h[\"data\"].read_direct(array)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "bd2e177b-e624-4e0c-89f4-62f5b86a4d29",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
