{
 "cells": [
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Labwork 3\n",
    "This new labwork focuses on the SCAN pattern, which is the Swiss-knife of the little parallel programmer...\n",
    "\n",
    "The three exercises are dealing with the Inclusive SCAN version, from left to right (no reverse)."
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Exercise 1\n",
    "In this exercise, you must implement a **block-scan**. \n",
    "It is a scan into a single block, for a small amount of data (256 values). \n",
    "Then, you should implement the PRAM algorithm, with the correct per-warp synchronizations...\n",
    "\n",
    "NB: you must write the three device functions:\n",
    "- `load_shared_memory`\n",
    "- `pointer_jumping`\n",
    "- `save_shared_memory`\n",
    "The kernel should be read, too..."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from __future__ import annotations\n",
    "\n",
    "import numpy as np\n",
    "from numba.np.numpy_support import from_dtype\n",
    "from numba import cuda, core\n",
    "\n",
    "\n",
    "class BlockScan(object):\n",
    "    \"\"\"Create a scan object that scans values using a given binary\n",
    "    function. The binary function is compiled once and cached inside this\n",
    "    object. Keeping this object alive will prevent re-compilation.\n",
    "\n",
    "    The scan is functional for a limited number of values, as it runs into a single block.\n",
    "    THe number of values should be less than 256.\n",
    "    \"\"\"\n",
    "\n",
    "    _cache = {}\n",
    "\n",
    "    _WARP_SIZE = 32\n",
    "    _NUM_WARPS = 8\n",
    "\n",
    "    @staticmethod\n",
    "    def _gpu_kernel_factory(fn, np_type):\n",
    "        \"\"\"Factory of kernels for the block-scan problem...\n",
    "\n",
    "        This function returns a Cuda Kernel that does block-scan of some data using a given binary functor.\"\"\"\n",
    "\n",
    "        scan_op = cuda.jit(device=True)(fn)\n",
    "\n",
    "        max_block_size = BlockScan._NUM_WARPS * BlockScan._WARP_SIZE\n",
    "\n",
    "        @cuda.jit(device=True)\n",
    "        def load_shared_memory(shared_memory, d_input):\n",
    "            local_tid = cuda.threadIdx.x\n",
    "            # TODO: load data into shared memory\n",
    "\n",
    "        @cuda.jit(device=True)\n",
    "        def pointer_jumping(shared_memory, jump):\n",
    "            tid = cuda.threadIdx.x\n",
    "            # TODO: implement the pointer jumping\n",
    "\n",
    "        @cuda.jit(device=True)\n",
    "        def save_shared_memory(shared_memory, d_output):\n",
    "            local_tid = cuda.threadIdx.x\n",
    "            # TODO: save data from shared memory\n",
    "\n",
    "        def gpu_scan_block(d_input, d_output):\n",
    "            \"\"\"\n",
    "            Per block SCAN...\n",
    "            \"\"\"\n",
    "            # move data to shared memory\n",
    "            shared_memory = cuda.shared.array(shape=max_block_size, dtype=np_type)\n",
    "            load_shared_memory(shared_memory, d_input)\n",
    "\n",
    "            # TODO: implements the logics (pointer jumping)\n",
    "\n",
    "            # now stores the result\n",
    "            save_shared_memory(shared_memory, d_output)\n",
    "\n",
    "        return cuda.jit(gpu_scan_block)\n",
    "\n",
    "    def __init__(self, functor):\n",
    "        \"\"\"\n",
    "        :param functor: A function implementing a binary operation for\n",
    "                        scan. It will be compiled as a CUDA device\n",
    "                        function using ``cuda.jit(device=True)``.\n",
    "        \"\"\"\n",
    "        self._functor = functor\n",
    "\n",
    "    def _compile(self, dtype):\n",
    "        key = self._functor, dtype\n",
    "        if key not in self._cache:\n",
    "            self._cache[key] = BlockScan._gpu_kernel_factory(self._functor, from_dtype(dtype))\n",
    "        return self._cache[key]\n",
    "\n",
    "    def __call__(self, d_input, d_output, verbose, stream=cuda.default_stream()):\n",
    "        \"\"\" Performs a per-block SCAN.\n",
    "\n",
    "        :param d_input: A host or device array.\n",
    "        :param stream: Optional CUDA stream in which to perform the scan.\n",
    "                    If no stream is specified, the default stream of 0 is used.\n",
    "        \"\"\"\n",
    "\n",
    "        # ensure 1d array\n",
    "        if d_input.ndim != 1:\n",
    "            raise TypeError(\"only support 1D array\")\n",
    "\n",
    "        # ensure size > 0\n",
    "        if d_input.size < 1:\n",
    "            raise ValueError(\"array's length is 0\")\n",
    "\n",
    "        # ensure arrays' size are the same\n",
    "        if d_input.size != d_output.size:\n",
    "            raise ValueError(\"arrays' length are different ({d_input.size} / {d_output.size}\")\n",
    "\n",
    "        # ensure size < 256\n",
    "        max_block_size = BlockScan._WARP_SIZE * BlockScan._NUM_WARPS\n",
    "        if d_input.size < max_block_size:\n",
    "            raise ValueError(\"array's length is greater than max block size\")\n",
    "\n",
    "        _sav, core.config.CUDA_LOW_OCCUPANCY_WARNINGS = core.config.CUDA_LOW_OCCUPANCY_WARNINGS, False\n",
    "\n",
    "        kernel = self._compile(d_input.dtype)\n",
    "\n",
    "        # Perform the reduction on the GPU\n",
    "        start_event = cuda.event(True)\n",
    "        start_event.record(stream=stream)\n",
    "\n",
    "        nb_threads = max_block_size\n",
    "        nb_blocks = 1\n",
    "        kernel[nb_blocks, nb_threads, stream](d_input, d_output)\n",
    "\n",
    "        stop_event = cuda.event(True)\n",
    "        stop_event.record(stream=stream)\n",
    "        stop_event.synchronize()\n",
    "        ct = cuda.event_elapsed_time(start_event, stop_event)\n",
    "\n",
    "        core.config.CUDA_LOW_OCCUPANCY_WARNINGS = _sav\n",
    "\n",
    "        return d_output.copy_to_host(), ct\n",
    "\n",
    "\n",
    "if __name__ == \"__main__\":\n",
    "    def display(array):\n",
    "        for i in range(array.size):\n",
    "            print(f\"{array[i]} \", end=\"\")\n",
    "            if (i % 16) == 15:\n",
    "                print(\"\")\n",
    "\n",
    "\n",
    "    def check(expected, result):\n",
    "        it_works = np.array_equal(expected, result)\n",
    "        if it_works:\n",
    "            print(f'- seems to work')\n",
    "        else:\n",
    "            print(f'- seems to not work, here is your result:')\n",
    "            display(result)\n",
    "\n",
    "\n",
    "    def test_int32(size):\n",
    "        scanner = BlockScan(lambda a, b: a + b)\n",
    "        h_array = np.ones(size, dtype=np.int32)\n",
    "        ct=[]\n",
    "        verbose = True\n",
    "        for _ in range(15):\n",
    "            result, time = scanner(cuda.to_device(h_array), cuda.to_device(h_array), verbose)\n",
    "            verbose = False\n",
    "            ct.append(time)\n",
    "        print(f\"minimum kernel computation time is {min(ct)} ms\")\n",
    "        check(np.cumsum(h_array), result)\n",
    "\n",
    "\n",
    "    def test_float32(size):\n",
    "        scanner = BlockScan(lambda a, b: a + b)\n",
    "        h_array = np.ones(size, dtype=np.float32)\n",
    "        ct=[]\n",
    "        verbose = True\n",
    "        for _ in range(15):\n",
    "            result, time = scanner(cuda.to_device(h_array), cuda.to_device(h_array), verbose)\n",
    "            verbose = False\n",
    "            ct.append(time)\n",
    "        print(f\"minimum kernel computation time is {min(ct)} ms\")\n",
    "        check(np.cumsum(h_array), result)\n",
    "\n",
    "\n",
    "    def test_float64(size):\n",
    "        scanner = BlockScan(lambda a, b: a + b)\n",
    "        h_array = np.ones(size, dtype=np.float64)\n",
    "        ct=[]\n",
    "        verbose = True\n",
    "        for _ in range(15):\n",
    "            result, time = scanner(cuda.to_device(h_array), cuda.to_device(h_array), verbose)\n",
    "            verbose = False\n",
    "            ct.append(time)\n",
    "        print(f\"minimum kernel computation time is {min(ct)} ms\")\n",
    "        check(np.cumsum(h_array), result)\n",
    "\n",
    "\n",
    "    test_int32(1 << 8)\n",
    "    test_float32(1 << 8)\n",
    "    test_float64(1 << 8)\n"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Exercise 2\n",
    "This exercise deals with a block SCAN, but here applying the **hybrid** strategy saw in lecture 3. \n",
    "\n",
    "The idea is then (for a single block):\n",
    "- to load the data in shared memory\n",
    "- to do a SCAN per warp (so without warp synchronization)\n",
    "- to do a warp synchronization\n",
    "- to do a SCAN considering the last value of each warp (into a single warp, so no warp synchronization)\n",
    "- to do a warp synchronization\n",
    "- to apply the final MAP for all warps except the first...\n",
    "\n",
    "Try this wonderful strategy below.\n",
    "\n",
    "NB: the computation time will be roughly speaking the same, but the idea is to apply this algorithm in simple condition first, before to do it at a larger scale."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from __future__ import annotations\n",
    "\n",
    "import numpy as np\n",
    "from numba.np.numpy_support import from_dtype\n",
    "from numba import cuda, core\n",
    "\n",
    "\n",
    "class BlockScanHybrid(object):\n",
    "    \"\"\"Create a scan object that scans values using a given binary\n",
    "    function. The binary function is compiled once and cached inside this\n",
    "    object. Keeping this object alive will prevent re-compilation.\n",
    "\n",
    "    The scan is functional for a limited number of values, as it runs into a single block.\n",
    "    THe number of values should be less than 256.\n",
    "    \"\"\"\n",
    "\n",
    "    _cache = {}\n",
    "\n",
    "    _WARP_SIZE = 32\n",
    "    _NUM_WARPS = 8\n",
    "\n",
    "    @staticmethod\n",
    "    def _gpu_kernel_factory(fn, np_type):\n",
    "        \"\"\"Factory of kernels for the block-scan problem...\n",
    "\n",
    "        This function returns a Cuda Kernel that does block-scan of some data using a given binary functor.\"\"\"\n",
    "\n",
    "        scan_op = cuda.jit(device=True)(fn)\n",
    "\n",
    "        max_block_size = BlockScanHybrid._NUM_WARPS * BlockScanHybrid._WARP_SIZE\n",
    "        \n",
    "        @cuda.jit(device=True)\n",
    "        def load_shared_memory(shared_memory, d_input):\n",
    "            local_tid = cuda.threadIdx.x\n",
    "            # TODO: load data into shared memory\n",
    "\n",
    "        @cuda.jit(device=True)\n",
    "        def scan_per_warp(shared_memory):\n",
    "            tid = cuda.threadIdx.x\n",
    "            warp_tid = tid & 31\n",
    "            # TODO: per warp scan\n",
    "\n",
    "        @cuda.jit(device=True)\n",
    "        def get_extra_value(shared_memory, last):\n",
    "            tid = cuda.threadIdx.x\n",
    "            warp_tid = tid & 31\n",
    "            # TODO: collect last value of each warp (copy into \"last\")\n",
    "\n",
    "        @cuda.jit(device=True)\n",
    "        def apply_final_map(shared_memory, last):\n",
    "            tid = cuda.threadIdx.x\n",
    "            warp_id = tid >> 5\n",
    "            # TODO: apply final MAP\n",
    "\n",
    "        @cuda.jit(device=True)\n",
    "        def save_shared_memory(shared_memory, d_output):\n",
    "            local_tid = cuda.threadIdx.x\n",
    "            # TODO: save data from shared memory\n",
    "\n",
    "        def gpu_scan_block(d_input, d_output):\n",
    "            \"\"\"\n",
    "            Per block SCAN...\n",
    "            \"\"\"\n",
    "            # move data to shared memory\n",
    "            shared_memory = cuda.shared.array(shape=max_block_size, dtype=np_type)\n",
    "            extra_shared_memory = cuda.shared.array(shape=32, dtype=np_type)\n",
    "            load_shared_memory(shared_memory, d_input)\n",
    "\n",
    "            # TODO: implements the logics\n",
    "\n",
    "        return cuda.jit(gpu_scan_block)\n",
    "\n",
    "    def __init__(self, functor):\n",
    "        \"\"\"\n",
    "        :param functor: A function implementing a binary operation for\n",
    "                        scan. It will be compiled as a CUDA device\n",
    "                        function using ``cuda.jit(device=True)``.\n",
    "        \"\"\"\n",
    "        self._functor = functor\n",
    "\n",
    "    def _compile(self, dtype):\n",
    "        key = self._functor, dtype\n",
    "        if key not in self._cache:\n",
    "            self._cache[key] = BlockScanHybrid._gpu_kernel_factory(self._functor, from_dtype(dtype))\n",
    "        return self._cache[key]\n",
    "\n",
    "    def __call__(self, d_input, d_output=None, verbose=False, stream=cuda.default_stream()):\n",
    "        \"\"\" Performs a per-block SCAN.\n",
    "\n",
    "        :param d_input: A host or device array.\n",
    "        :param stream: Optional CUDA stream in which to perform the scan.\n",
    "                    If no stream is specified, the default stream of 0 is used.\n",
    "        \"\"\"\n",
    "\n",
    "        # ensure 1d array\n",
    "        if d_input.ndim != 1:\n",
    "            raise TypeError(\"only support 1D array\")\n",
    "\n",
    "        # ensure size > 0\n",
    "        if d_input.size < 1:\n",
    "            raise ValueError(\"array's length is 0\")\n",
    "\n",
    "        # ensure arrays' size are the same\n",
    "        if d_input.size != d_output.size:\n",
    "            raise ValueError(\"arrays' length are different ({d_input.size} / {d_output.size}\")\n",
    "\n",
    "        # ensure size < 256\n",
    "        max_block_size = BlockScanHybrid._WARP_SIZE * BlockScanHybrid._NUM_WARPS\n",
    "        if d_input.size < max_block_size:\n",
    "            raise ValueError(\"array's length is greater than max block size\")\n",
    "\n",
    "        _sav, core.config.CUDA_LOW_OCCUPANCY_WARNINGS = core.config.CUDA_LOW_OCCUPANCY_WARNINGS, False\n",
    "\n",
    "        kernel = self._compile(d_input.dtype)\n",
    "        # turn on GPU...\n",
    "        nb_threads = max_block_size\n",
    "        nb_blocks = 1\n",
    "        kernel[nb_blocks, nb_threads, stream](d_input, d_output)\n",
    "\n",
    "        # Perform the reduction on the GPU\n",
    "        start_event = cuda.event(True)\n",
    "        start_event.record(stream=stream)\n",
    "\n",
    "        nb_threads = max_block_size\n",
    "        nb_blocks = 1\n",
    "        kernel[nb_blocks, nb_threads, stream](d_input, d_output)\n",
    "\n",
    "        stop_event = cuda.event(True)\n",
    "        stop_event.record(stream=stream)\n",
    "        stop_event.synchronize()\n",
    "        ct = cuda.event_elapsed_time(start_event, stop_event)\n",
    "\n",
    "        core.config.CUDA_LOW_OCCUPANCY_WARNINGS = _sav\n",
    "\n",
    "        return d_output.copy_to_host(), ct\n",
    "\n",
    "\n",
    "if __name__ == \"__main__\":\n",
    "    def display(array):\n",
    "        for i in range(array.size):\n",
    "            print(f\"{array[i]} \", end=\"\")\n",
    "            if (i % 16) == 15:\n",
    "                print(\"\")\n",
    "\n",
    "\n",
    "    def check(expected, result):\n",
    "        it_works = True\n",
    "        for i in range(result.size):\n",
    "            if expected[i] != result[i]:\n",
    "                it_works = False\n",
    "        if it_works:\n",
    "            print(f'- seems to work')\n",
    "        else:\n",
    "            print(f'- seems to not work, here is your result:')\n",
    "            display(result)\n",
    "\n",
    "\n",
    "    def test_int32(size):\n",
    "        scanner = BlockScanHybrid(lambda a, b: a + b)\n",
    "        h_array = np.ones(size, dtype=np.int32)\n",
    "        ct=[]\n",
    "        verbose = True\n",
    "        for _ in range(15):\n",
    "            result, time = scanner(cuda.to_device(h_array), cuda.to_device(h_array), verbose)\n",
    "            verbose = False\n",
    "            ct.append(time)\n",
    "        print(f\"minimum kernel computation time is {min(ct)} ms\")\n",
    "        check(np.cumsum(h_array), result)\n",
    "\n",
    "\n",
    "    def test_float32(size):\n",
    "        scanner = BlockScanHybrid(lambda a, b: a + b)\n",
    "        h_array = np.ones(size, dtype=np.float32)\n",
    "        ct=[]\n",
    "        verbose = True\n",
    "        for _ in range(15):\n",
    "            result, time = scanner(cuda.to_device(h_array), cuda.to_device(h_array), verbose)\n",
    "            verbose = False\n",
    "            ct.append(time)\n",
    "        print(f\"minimum kernel computation time is {min(ct)} ms\")\n",
    "        check(np.cumsum(h_array), result)\n",
    "\n",
    "\n",
    "    def test_float64(size):\n",
    "        scanner = BlockScanHybrid(lambda a, b: a + b)\n",
    "        h_array = np.ones(size, dtype=np.float64)\n",
    "        ct=[]\n",
    "        verbose = True\n",
    "        for _ in range(15):\n",
    "            result, time = scanner(cuda.to_device(h_array), cuda.to_device(h_array), verbose)\n",
    "            verbose = False\n",
    "            ct.append(time)\n",
    "        print(f\"minimum kernel computation time is {min(ct)} ms\")\n",
    "        check(np.cumsum(h_array), result)\n",
    "\n",
    "\n",
    "    test_int32(1 << 8)\n",
    "    test_float32(1 << 8)\n",
    "    test_float64(1 << 8)\n"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Exercise 3\n",
    "In this exercise you must implement the **full scan** for a given binary associative operator.\n",
    "\n",
    "The lecture 3 gives you all the clue for this purpose:\n",
    "- the per block SCAN, with its extra value,\n",
    "- the inter-block extra value scan which is a recursive call,\n",
    "- the MAP to end the SCAN by adding the extra block scanned values (with shift). "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from __future__ import annotations\n",
    "\n",
    "import numpy as np\n",
    "from numba.np.numpy_support import from_dtype\n",
    "from numba import cuda, core\n",
    "\n",
    "\n",
    "def display(array, all_values=False):\n",
    "    def aprint(data_range, noend=False):\n",
    "        for i in data_range:\n",
    "            print(f\"{array[i]} \", end=\"\")\n",
    "            if (i % 16) == 15: print(\"\")\n",
    "        if not noend: print(\"...\")\n",
    "\n",
    "    if all_values:\n",
    "        aprint(range(array.size), noend=True)\n",
    "        return\n",
    "    aprint(range(0, min(array.size, 16)))\n",
    "    if array.size < 256 - 16: return\n",
    "    aprint(range(256 - 16, min(array.size, 256 + 16)))\n",
    "    if array.size < 512 - 16: return\n",
    "    aprint(range(512 - 16, min(array.size, 512 + 16)))\n",
    "    if array.size < 768 - 16: return\n",
    "    aprint(range(768 - 16, min(array.size, 768 + 16)))\n",
    "    aprint(range(array.size - 16 - 256, array.size - 256 + 16))\n",
    "    aprint(range(array.size - 16, array.size), noend=True)\n",
    "\n",
    "\n",
    "class InclusiveScan(object):\n",
    "    \"\"\"Create a scan object that scans values using a given binary\n",
    "    function. The binary function is compiled once and cached inside this\n",
    "    object. Keeping this object alive will prevent re-compilation.\n",
    "\n",
    "    The scan is functional for a limited number of values, as it runs into a single block.\n",
    "    THe number of values should be less than 256.\n",
    "    \"\"\"\n",
    "\n",
    "    _kernels_block_scan = {}\n",
    "    _kernels_block_map = {}\n",
    "\n",
    "    _WARP_SIZE = 32\n",
    "    _NUM_WARPS = 8\n",
    "\n",
    "    @staticmethod\n",
    "    def _gpu_kernel_block_scan_factory(fn, np_type):\n",
    "        \"\"\"Factory of kernels for the block-scan problem...\n",
    "\n",
    "        This function returns a Cuda Kernel that does block-scan of some data using a given binary functor.\"\"\"\n",
    "\n",
    "        scan_op = cuda.jit(device=True)(fn)\n",
    "\n",
    "        max_block_size = InclusiveScan._NUM_WARPS * InclusiveScan._WARP_SIZE\n",
    "\n",
    "        @cuda.jit(device=True)\n",
    "        def load_shared_memory(shared_memory, d_input):\n",
    "            local_tid = cuda.threadIdx.x\n",
    "            tid = cuda.grid(1)\n",
    "            # TODO: load data into shared memory\n",
    "\n",
    "        @cuda.jit(device=True)\n",
    "        def pointer_jumping(shared_memory, jump):\n",
    "            tid = cuda.threadIdx.x\n",
    "            # TODO: implement the pointer jumping (full block)\n",
    "\n",
    "        @cuda.jit(device=True)\n",
    "        def save_shared_memory(shared_memory, d_output, d_extras):\n",
    "            local_tid = cuda.threadIdx.x\n",
    "            global_tid = cuda.grid(1)\n",
    "            # TODO: save data from shared memory\n",
    "            \n",
    "            # TODO: save last block value to \"d_extras\"!\n",
    "\n",
    "        def gpu_scan_block(d_input, d_output, d_extras):\n",
    "            \"\"\"\n",
    "            Per block SCAN...\n",
    "            \"\"\"\n",
    "            # move data to shared memory\n",
    "            shared_memory = cuda.shared.array(shape=max_block_size, dtype=np_type)\n",
    "            load_shared_memory(shared_memory, d_input)\n",
    "\n",
    "            # TODO: implements the logics\n",
    "\n",
    "            # now stores the result\n",
    "            save_shared_memory(shared_memory, d_output, d_extras)\n",
    "\n",
    "        return cuda.jit(gpu_scan_block)\n",
    "\n",
    "    def __init__(self, functor):\n",
    "        \"\"\"\n",
    "        :param functor: A function implementing a binary operation for\n",
    "                        scan. It will be compiled as a CUDA device\n",
    "                        function using ``cuda.jit(device=True)``.\n",
    "        \"\"\"\n",
    "        self._functor = functor\n",
    "\n",
    "    def _compile_block_scan(self, dtype):\n",
    "        key = self._functor, dtype\n",
    "        if key not in self._kernels_block_scan:\n",
    "            self._kernels_block_scan[key] = InclusiveScan._gpu_kernel_block_scan_factory(self._functor,\n",
    "                                                                                         from_dtype(dtype))\n",
    "        return self._kernels_block_scan[key]\n",
    "\n",
    "    @staticmethod\n",
    "    def _gpu_kernel_block_map_factory(fn, np_type):\n",
    "        \"\"\"Factory of kernels for the block-map problem...\n",
    "\n",
    "        This function returns a Cuda Kernel that does block-map of some data using a given binary functor.\"\"\"\n",
    "\n",
    "        scan_op = cuda.jit(device=True)(fn)\n",
    "\n",
    "        def gpu_map_block(d_io, d_scan):\n",
    "            \"\"\"\n",
    "            Per block MAP...\n",
    "            \"\"\"\n",
    "            # TODO: implements the logics\n",
    "\n",
    "        return cuda.jit(gpu_map_block)\n",
    "\n",
    "    def _compile_block_map(self, dtype):\n",
    "        key = self._functor, dtype\n",
    "        if key not in self._kernels_block_map:\n",
    "            self._kernels_block_map[key] = InclusiveScan._gpu_kernel_block_map_factory(self._functor, from_dtype(dtype))\n",
    "        return self._kernels_block_map[key]\n",
    "\n",
    "    def __call__(self, d_input, d_output, verbose=False, stream=cuda.default_stream()):\n",
    "        \"\"\" Performs a per-block SCAN.\n",
    "\n",
    "        :param d_input: A host or device array.\n",
    "        :param stream: Optional CUDA stream in which to perform the scan.\n",
    "                    If no stream is specified, the default stream of 0 is used.\n",
    "        \"\"\"\n",
    "\n",
    "        # ensure 1d array\n",
    "        if d_input.ndim != 1:\n",
    "            raise TypeError(\"only support 1D array\")\n",
    "\n",
    "        # ensure size > 0\n",
    "        if d_input.size < 1:\n",
    "            raise ValueError(\"array's length is 0\")\n",
    "\n",
    "        # ensure arrays' size are the same\n",
    "        if d_input.size != d_output.size:\n",
    "            raise ValueError(\"arrays' length are different ({d_input.size} / {d_output.size}\")\n",
    "\n",
    "        _sav, core.config.CUDA_LOW_OCCUPANCY_WARNINGS = core.config.CUDA_LOW_OCCUPANCY_WARNINGS, False\n",
    "\n",
    "        kernel_step1 = self._compile_block_scan(d_input.dtype)\n",
    "        kernel_step2 = self._compile_block_map(d_input.dtype)\n",
    "\n",
    "        max_block_size = InclusiveScan._WARP_SIZE * InclusiveScan._NUM_WARPS\n",
    "        nb_threads = min(max_block_size, d_input.size)\n",
    "        nb_blocks = (d_input.size + nb_threads - 1) // nb_threads\n",
    "        extras = cuda.device_array(shape=nb_blocks, dtype=d_input.dtype)\n",
    "\n",
    "        # Perform the reduction on the GPU\n",
    "        start_event = cuda.event(True)\n",
    "        start_event.record(stream=stream)\n",
    "\n",
    "        kernel_step1[nb_blocks, nb_threads, stream](d_input, d_output, extras)\n",
    "        if nb_blocks > 1:\n",
    "            # display(extras, all=True)\n",
    "            self(extras, extras, stream)\n",
    "            kernel_step2[nb_blocks, nb_threads, stream](d_output, extras)\n",
    "\n",
    "        stop_event = cuda.event(True)\n",
    "        stop_event.record(stream=stream)\n",
    "        stop_event.synchronize()\n",
    "\n",
    "        core.config.CUDA_LOW_OCCUPANCY_WARNINGS = _sav\n",
    "\n",
    "        # display(extras)\n",
    "\n",
    "        return cuda.event_elapsed_time(start_event, stop_event)\n",
    "\n",
    "\n",
    "if __name__ == \"__main__\":\n",
    "\n",
    "    def check(expected, result):\n",
    "        it_works = np.array_equal(expected, result)\n",
    "        if it_works:\n",
    "            print(f'- seems to work')\n",
    "        else:\n",
    "            print(f'- seems to not work, here is your result:')\n",
    "            display(result)\n",
    "            display(expected)\n",
    "\n",
    "\n",
    "    def test_int32(size):\n",
    "        scanner = InclusiveScan(lambda a, b: a + b)\n",
    "        h_array = np.ones(size, dtype=np.int32)\n",
    "        d_result = cuda.device_array(size, dtype=np.int32)\n",
    "        ct=[]\n",
    "        verbose = True\n",
    "        for _ in range(15):\n",
    "            time = scanner(cuda.to_device(h_array), d_result, verbose)\n",
    "            verbose = False\n",
    "            ct.append(time)\n",
    "        print(f\"minimum kernel computation time is {min(ct)} ms\")\n",
    "        expected = np.cumsum(h_array)\n",
    "        check(expected, d_result.copy_to_host())\n",
    "\n",
    "\n",
    "    def test_float32(size):\n",
    "        scanner = InclusiveScan(lambda a, b: a + b)\n",
    "        h_array = np.ones(size, dtype=np.float32)\n",
    "        d_result = cuda.device_array(size, dtype=np.float32)\n",
    "        ct=[]\n",
    "        verbose = True\n",
    "        for _ in range(15):\n",
    "            time = scanner(cuda.to_device(h_array), d_result, verbose)\n",
    "            verbose = False\n",
    "            ct.append(time)\n",
    "        print(f\"minimum kernel computation time is {min(ct)} ms\")\n",
    "        expected = np.cumsum(h_array)\n",
    "        check(expected, d_result.copy_to_host())\n",
    "\n",
    "\n",
    "    def test_float64(size):\n",
    "        scanner = InclusiveScan(lambda a, b: a + b)\n",
    "        h_array = np.ones(size, dtype=np.float64)\n",
    "        d_result = cuda.device_array(size, dtype=np.float64)\n",
    "        ct=[]\n",
    "        verbose = True\n",
    "        for _ in range(15):\n",
    "            time = scanner(cuda.to_device(h_array), d_result, verbose)\n",
    "            verbose = False\n",
    "            ct.append(time)\n",
    "        print(f\"minimum kernel computation time is {min(ct)} ms\")\n",
    "        expected = np.cumsum(h_array)\n",
    "        check(expected, d_result.copy_to_host())\n",
    "\n",
    "\n",
    "    test_int32(1 << 24)\n",
    "    # with float32, the sequential cumsum fails with more than 2^24 number...\n",
    "    test_float32(1 << 24)\n",
    "    test_float64(1 << 24)\n"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3.9.7 64-bit",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "name": "python",
   "version": "3.9.7"
  },
  "orig_nbformat": 4,
  "vscode": {
   "interpreter": {
    "hash": "9b4d75ac280b6c7c3aa43866cb82dc88915409b55fec83a093dd0284cb58708e"
   }
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
