From f42943b24f2f65c5b9695db3243e2563064642b0 Mon Sep 17 00:00:00 2001
From: Moritz Lehmann <moritz.lehmann@uni-bayreuth.de>
Date: Wed, 12 Apr 2023 12:33:42 +0200
Subject: [PATCH 1/5] Updated Readme

---
 README.md | 257 ++++++++++++++++++++++++------------------------------
 1 file changed, 112 insertions(+), 145 deletions(-)

diff --git a/README.md b/README.md
index 21586ac8..7cfbfd9d 100644
--- a/README.md
+++ b/README.md
@@ -3,7 +3,7 @@
 The fastest and most memory efficient lattice Boltzmann CFD software, running on all GPUs via [OpenCL](https://github.com/ProjectPhysX/OpenCL-Wrapper "OpenCL-Wrapper").
 
 <a href="https://youtu.be/o3TPN142HxM"><img src="https://img.youtube.com/vi/o3TPN142HxM/maxresdefault.jpg" width="50%"></img></a><a href="https://youtu.be/oC6U1M0Fsug"><img src="https://img.youtube.com/vi/oC6U1M0Fsug/maxresdefault.jpg" width="50%"></img></a><br>
-<a href="https://youtu.be/NQPgumd3Ei8"><img src="https://img.youtube.com/vi/NQPgumd3Ei8/maxresdefault.jpg" width="50%"></img></a><a href="https://youtu.be/aqG8qZ_Gc4U"><img src="https://img.youtube.com/vi/aqG8qZ_Gc4U/maxresdefault.jpg" width="50%"></img></a>
+<a href="https://youtu.be/XOfXHgP4jnQ"><img src="https://img.youtube.com/vi/XOfXHgP4jnQ/maxresdefault.jpg" width="50%"></img></a><a href="https://youtu.be/aqG8qZ_Gc4U"><img src="https://img.youtube.com/vi/aqG8qZ_Gc4U/maxresdefault.jpg" width="50%"></img></a>
 
 
 <details><summary>Update History</summary>
@@ -61,10 +61,13 @@ The fastest and most memory efficient lattice Boltzmann CFD software, running on
 
 ## Compute Features
 
-- CFD model: lattice Boltzmann method (LBM)
-  <details><summary>&#9900; &nbsp;streaming (part 2/2)</summary><p align="center"><i>f</i><sub>0</sub><sup>temp</sup>(<i>x</i>,<i>t</i>) = <i>f</i><sub>0</sub>(<i>x</i>, <i>t</i>)<br><i>f<sub>i</sub></i><sup>temp</sup>(<i>x</i>,<i>t</i>) = <i>f</i><sub>(<i>t</i>%2 ? <i>i</i> : (<i>i</i>%2 ? <i>i</i>+1 : <i>i</i>-1))</sub>(<i>i</i>%2 ? <i>x</i> : <i>x</i>-<i>e<sub>i</sub></i>, <i>t</i>) &nbsp; for &nbsp; <i>i</i> &isin; [1, <i>q</i>-1]</p></details>
-  <details><summary>&#9900; &nbsp;collision</summary><p align="center"><i>&rho;</i>(<i>x</i>,<i>t</i>) = (&Sigma;<sub><i>i</i></sub> <i>f<sub>i</sub></i><sup>temp</sup>(<i>x</i>,<i>t</i>)) + 1<br><br><i>u</i>(<i>x</i>,<i>t</i>) = <sup>1</sup>&#8725;<sub><i>&rho;</i>(<i>x</i>,<i>t</i>)</sub> &Sigma;<sub><i>i</i></sub> <i>c<sub>i</sub></i> <i>f<sub>i</sub></i><sup>temp</sup>(<i>x</i>,<i>t</i>)<br><br><i>f<sub>i</sub></i><sup>eq-shifted</sup>(<i>x</i>,<i>t</i>) = <i>w<sub>i</sub></i> <i>&rho;</i> · (<sup>(<i>u</i><sub>°</sub><i>c<sub>i</sub></i>)<sup>2</sup></sup>&#8725;<sub>(2<i>c</i><sup>4</sup>)</sub> - <sup>(<i>u</i><sub>°</sub><i>u</i>)</sup>&#8725;<sub>(2c<sup>2</sup>)</sub> + <sup>(<i>u</i><sub>°</sub><i>c<sub>i</sub></i>)</sup>&#8725;<sub><i>c</i><sup>2</sup></sub>) + <i>w<sub>i</sub></i> (<i>&rho;</i>-1)<br><br><i>f<sub>i</sub></i><sup>temp</sup>(<i>x</i>, <i>t</i>+&Delta;<i>t</i>) = <i>f<sub>i</sub></i><sup>temp</sup>(<i>x</i>,<i>t</i>) + <i>&Omega;<sub>i</sub></i>(<i>f<sub>i</sub></i><sup>temp</sup>(<i>x</i>,<i>t</i>), <i>f<sub>i</sub></i><sup>eq-shifted</sup>(<i>x</i>,<i>t</i>), <i>&tau;</i>)</p></details>
-  <details><summary>&#9900; &nbsp;streaming (part 1/2)</summary><p align="center"><i>f</i><sub>0</sub>(<i>x</i>, <i>t</i>+&Delta;<i>t</i>) = <i>f</i><sub>0</sub><sup>temp</sup>(<i>x</i>, <i>t</i>+&Delta;<i>t</i>)<br><i>f</i><sub>(<i>t</i>%2 ? (<i>i</i>%2 ? <i>i</i>+1 : <i>i</i>-1) : <i>i</i>)</sub>(<i>i</i>%2 ? <i>x</i>+<i>e<sub>i</sub></i> : <i>x</i>, <i>t</i>+&Delta;<i>t</i>) = <i>f<sub>i</sub></i><sup>temp</sup>(<i>x</i>, <i>t</i>+&Delta;<i>t</i>) &nbsp; for &nbsp; <i>i</i> &isin; [1, <i>q</i>-1]</p></details>
+- <details><summary>CFD model: lattice Boltzmann method (LBM)</summary>
+
+  - streaming (part 2/2)<p align="center"><i>f</i><sub>0</sub><sup>temp</sup>(<i>x</i>,<i>t</i>) = <i>f</i><sub>0</sub>(<i>x</i>, <i>t</i>)<br><i>f<sub>i</sub></i><sup>temp</sup>(<i>x</i>,<i>t</i>) = <i>f</i><sub>(<i>t</i>%2 ? <i>i</i> : (<i>i</i>%2 ? <i>i</i>+1 : <i>i</i>-1))</sub>(<i>i</i>%2 ? <i>x</i> : <i>x</i>-<i>e<sub>i</sub></i>, <i>t</i>) &nbsp; for &nbsp; <i>i</i> &isin; [1, <i>q</i>-1]</p>
+  - collision<p align="center"><i>&rho;</i>(<i>x</i>,<i>t</i>) = (&Sigma;<sub><i>i</i></sub> <i>f<sub>i</sub></i><sup>temp</sup>(<i>x</i>,<i>t</i>)) + 1<br><br><i>u</i>(<i>x</i>,<i>t</i>) = <sup>1</sup>&#8725;<sub><i>&rho;</i>(<i>x</i>,<i>t</i>)</sub> &Sigma;<sub><i>i</i></sub> <i>c<sub>i</sub></i> <i>f<sub>i</sub></i><sup>temp</sup>(<i>x</i>,<i>t</i>)<br><br><i>f<sub>i</sub></i><sup>eq-shifted</sup>(<i>x</i>,<i>t</i>) = <i>w<sub>i</sub></i> <i>&rho;</i> · (<sup>(<i>u</i><sub>°</sub><i>c<sub>i</sub></i>)<sup>2</sup></sup>&#8725;<sub>(2<i>c</i><sup>4</sup>)</sub> - <sup>(<i>u</i><sub>°</sub><i>u</i>)</sup>&#8725;<sub>(2c<sup>2</sup>)</sub> + <sup>(<i>u</i><sub>°</sub><i>c<sub>i</sub></i>)</sup>&#8725;<sub><i>c</i><sup>2</sup></sub>) + <i>w<sub>i</sub></i> (<i>&rho;</i>-1)<br><br><i>f<sub>i</sub></i><sup>temp</sup>(<i>x</i>, <i>t</i>+&Delta;<i>t</i>) = <i>f<sub>i</sub></i><sup>temp</sup>(<i>x</i>,<i>t</i>) + <i>&Omega;<sub>i</sub></i>(<i>f<sub>i</sub></i><sup>temp</sup>(<i>x</i>,<i>t</i>), <i>f<sub>i</sub></i><sup>eq-shifted</sup>(<i>x</i>,<i>t</i>), <i>&tau;</i>)</p>
+  - streaming (part 1/2)<p align="center"><i>f</i><sub>0</sub>(<i>x</i>, <i>t</i>+&Delta;<i>t</i>) = <i>f</i><sub>0</sub><sup>temp</sup>(<i>x</i>, <i>t</i>+&Delta;<i>t</i>)<br><i>f</i><sub>(<i>t</i>%2 ? (<i>i</i>%2 ? <i>i</i>+1 : <i>i</i>-1) : <i>i</i>)</sub>(<i>i</i>%2 ? <i>x</i>+<i>e<sub>i</sub></i> : <i>x</i>, <i>t</i>+&Delta;<i>t</i>) = <i>f<sub>i</sub></i><sup>temp</sup>(<i>x</i>, <i>t</i>+&Delta;<i>t</i>) &nbsp; for &nbsp; <i>i</i> &isin; [1, <i>q</i>-1]</p>
+
+  </details>
 
 <!-- markdown equations don't render properly in mobile browser
   - streaming (part 2/2):
@@ -82,128 +85,119 @@ $$j=t\\%2\\ ?\\ (i\\%2\\ ?\\ i+1\\ :\\ i-1)\\ :\\ i\\ \textrm{for}\\ i\in[1,q-1]
 $$f_j(i\\%2\\ ?\\ \vec{x}+\vec{e}_i\\ :\\ \vec{x},\\ t+\Delta t)=f_i^\textrm{temp}(\vec{x},\\ t+\Delta t)$$
  -->
 
-- peak performance on GPUs (datacenter/gaming/professional/laptop), validated with roofline model
-- optimized to minimize memory demand:
-  - traditional LBM (D3Q19) with FP64 requires ~344 Bytes/cell
-    ```
-    🟧🟧🟧🟧🟧🟧🟧🟧🟦🟦🟦🟦🟦🟦🟦🟦🟦🟦🟦🟦🟦🟦🟦🟦🟦🟦🟦🟦🟦🟦🟦🟦
-    🟨🟨🟨🟨🟨🟨🟨🟨🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩
-    🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩
-    🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩
-    🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩
-    🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩
-    🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥
-    🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥
-    🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥
-    🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥
-    🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥
-
-    (density 🟧, velocity 🟦, flags 🟨, 2 copies of DDFs 🟩/🟥; each square = 1 Byte)
-    ```
-    - allows for 3 Million cells per 1 GB VRAM
-  - FluidX3D (D3Q19) requires only 55 Bytes/cell with [Esoteric-Pull](https://doi.org/10.3390/computation10060092)+[FP16](https://www.researchgate.net/publication/362275548_Accuracy_and_performance_of_the_lattice_Boltzmann_method_with_64-bit_32-bit_and_customized_16-bit_number_formats)
-    ```
-    🟧🟧🟧🟧🟦🟦🟦🟦🟦🟦🟦🟦🟦🟦🟦🟦🟨🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩
-    🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩
+- <details><summary>optimized to minimize VRAM footprint to 1/6 of other LBM codes</summary>
 
-    (density 🟧, velocity 🟦, flags 🟨, DDFs 🟩; each square = 1 Byte)
-    ```
+  - traditional LBM (D3Q19) with FP64 requires ~344 Bytes/cell<br>
+    - 🟧🟧🟧🟧🟧🟧🟧🟧🟦🟦🟦🟦🟦🟦🟦🟦🟦🟦🟦🟦🟦🟦🟦🟦🟦🟦🟦🟦🟦🟦🟦🟦🟨🟨🟨🟨🟨🟨🟨🟨🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥<br>(density 🟧, velocity 🟦, flags 🟨, 2 copies of DDFs 🟩/🟥; each square = 1 Byte)
+    - allows for 3 Million cells per 1 GB VRAM
+  - FluidX3D (D3Q19) requires only 55 Bytes/cell with [Esoteric-Pull](https://doi.org/10.3390/computation10060092)+[FP16](https://www.researchgate.net/publication/362275548_Accuracy_and_performance_of_the_lattice_Boltzmann_method_with_64-bit_32-bit_and_customized_16-bit_number_formats)<br>
+    - 🟧🟧🟧🟧🟦🟦🟦🟦🟦🟦🟦🟦🟦🟦🟦🟦🟨🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩<br>(density 🟧, velocity 🟦, flags 🟨, DDFs 🟩; each square = 1 Byte)
     - allows for 19 Million cells per 1 GB VRAM
     - in-place streaming with [Esoteric-Pull](https://doi.org/10.3390/computation10060092): eliminates redundant copy `B` of density distribution functions (DDFs) in memory; almost cuts memory demand in half and slightly increases performance due to implicit bounce-back boundaries; offers optimal memory access patterns for single-cell in-place streaming
     - [decoupled arithmetic precision (FP32) and memory precision (FP32 or FP16S or FP16C)](https://www.researchgate.net/publication/362275548_Accuracy_and_performance_of_the_lattice_Boltzmann_method_with_64-bit_32-bit_and_customized_16-bit_number_formats): all arithmetic is done in FP32 for compatibility on all hardware, but DDFs in memory can be compressed to FP16S or FP16C: almost cuts memory demand in half again and almost doubles performance, without impacting overall accuracy for most setups
-- multi-GPU support on a single node (PC/laptop/server) via domain decomposition
-  - allows pooling VRAM from multiple GPUs for much larger grid resolution
+  - large cost saving: comparison of maximum single-GPU grid resolution for D3Q19 LBM
+
+    | GPU&nbsp;VRAM&nbsp;capacity      | 1&nbsp;GB | 2&nbsp;GB | 3&nbsp;GB | 4&nbsp;GB | 6&nbsp;GB | 8&nbsp;GB | 10&nbsp;GB | 11&nbsp;GB | 12&nbsp;GB | 16&nbsp;GB | 20&nbsp;GB | 24&nbsp;GB | 32&nbsp;GB | 40&nbsp;GB | 48&nbsp;GB | 64&nbsp;GB | 80&nbsp;GB | 94&nbsp;GB | 128&nbsp;GB | 192&nbsp;GB | 256&nbsp;GB |
+    | :------------------------------- | --------: | --------: | --------: | --------: | --------: | --------: | ---------: | ---------: | ---------: | ---------: | ---------: | ---------: | ---------: | ---------: | ---------: | ---------: | ---------: | ---------: | ----------: | ----------: | ----------: |
+    | approximate&nbsp;GPU&nbsp;price  | $25<br>GT&nbsp;210 | $25<br>GTX&nbsp;950 | $12<br>GTX&nbsp;1060 | $50<br>GT&nbsp;730 | $35<br>GTX&nbsp;1060 | $70<br>RX&nbsp;470 | $500<br>RTX&nbsp;3080 | $240<br>GTX&nbsp;1080&nbsp;Ti | $75<br>Tesla&nbsp;M40 | $75<br>Instinct&nbsp;MI25 | $900<br>RX&nbsp;7900&nbsp;XT | $205<br>Tesla&nbsp;P40 | $600<br>Instinct&nbsp;MI60 | $5500<br>A100 | $2400<br>RTX&nbsp;8000 | $31k<br>Instinct&nbsp;MI210 | $11k<br>A100 | >$40k<br>H100&nbsp;NVL | ?<br>Max&nbsp;Series&nbsp;1550 | - | - |
+    | traditional&nbsp;LBM&nbsp;(FP64) |      144³ |      182³ |      208³ |      230³ |      262³ |      288³ |       312³ |       322³ |       330³ |       364³ |       392³ |       418³ |       460³ |       494³ |       526³ |       578³ |       624³ |       658³ |        730³ |        836³ |        920³ |
+    | FluidX3D&nbsp;(FP32/FP32)        |      224³ |      282³ |      322³ |      354³ |      406³ |      448³ |       482³ |       498³ |       512³ |       564³ |       608³ |       646³ |       710³ |       766³ |       814³ |       896³ |       966³ |      1018³ |       1130³ |       1292³ |       1422³ |
+    | FluidX3D&nbsp;(FP32/FP16)        |      266³ |      336³ |      384³ |      424³ |      484³ |      534³ |       574³ |       594³ |       610³ |       672³ |       724³ |       770³ |       848³ |       912³ |       970³ |      1068³ |      1150³ |      1214³ |       1346³ |       1540³ |       1624³ |
+
+  </details>
+- <details><summary>cross-vendor multi-GPU support on a single PC/laptop/server</summary>
+
+  - domain decomposition allows pooling VRAM from multiple GPUs for much larger grid resolution
   - each domain (GPU) can hold up to 4.29 billion (2³², 1624³) lattice points (225 GB memory)
   - GPUs don't have to be identical (not even from the same vendor), but similar VRAM capacity/bandwidth is recommended
-  <details><summary>&#9900; &nbsp;domain communication architecture (simplified)</summary>
-
-  ```diff
-  ++   .-----------------------------------------------------------------.   ++
-  ++   |                              GPU 0                              |   ++
-  ++   |                          LBM Domain 0                           |   ++
-  ++   '-----------------------------------------------------------------'   ++
-  ++              |                 selective                /|\             ++
-  ++             \|/               in-VRAM copy               |              ++
-  ++        .-------------------------------------------------------.        ++
-  ++        |               GPU 0 - Transfer Buffer 0               |        ++
-  ++        '-------------------------------------------------------'        ++
-  !!                            |     PCIe     /|\                           !!
-  !!                           \|/    copy      |                            !!
-  @@        .-------------------------.   .-------------------------.        @@
-  @@        | CPU - Transfer Buffer 0 |   | CPU - Transfer Buffer 1 |        @@
-  @@        '-------------------------'\ /'-------------------------'        @@
-  @@                           pointer  X   swap                             @@
-  @@        .-------------------------./ \.-------------------------.        @@
-  @@        | CPU - Transfer Buffer 1 |   | CPU - Transfer Buffer 0 |        @@
-  @@        '-------------------------'   '-------------------------'        @@
-  !!                           /|\    PCIe      |                            !!
-  !!                            |     copy     \|/                           !!
-  ++        .-------------------------------------------------------.        ++
-  ++        |               GPU 1 - Transfer Buffer 1               |        ++
-  ++        '-------------------------------------------------------'        ++
-  ++             /|\                selective                 |              ++
-  ++              |                in-VRAM copy              \|/             ++
-  ++   .-----------------------------------------------------------------.   ++
-  ++   |                              GPU 1                              |   ++
-  ++   |                          LBM Domain 1                           |   ++
-  ++   '-----------------------------------------------------------------'   ++
-  ##                                    |                                    ##
-  ##                      domain synchronization barrier                     ##
-  ##                                    |                                    ##
-  ||   -------------------------------------------------------------> time   ||
-  ```
-
-  </details><details><summary>&#9900; &nbsp;domain communication architecture (detailed)</summary>
-
-  ```diff
-  ++   .-----------------------------------------------------------------.   ++
-  ++   |                              GPU 0                              |   ++
-  ++   |                          LBM Domain 0                           |   ++
-  ++   '-----------------------------------------------------------------'   ++
-  ++     |  selective in- /|\  |  selective in- /|\  |  selective in- /|\    ++
-  ++    \|/ VRAM copy (X)  |  \|/ VRAM copy (Y)  |  \|/ VRAM copy (Z)  |     ++
-  ++   .---------------------.---------------------.---------------------.   ++
-  ++   |    GPU 0 - TB 0X+   |    GPU 0 - TB 0Y+   |    GPU 0 - TB 0Z+   |   ++
-  ++   |    GPU 0 - TB 0X-   |    GPU 0 - TB 0Y-   |    GPU 0 - TB 0Z-   |   ++
-  ++   '---------------------'---------------------'---------------------'   ++
-  !!          | PCIe /|\            | PCIe /|\            | PCIe /|\         !!
-  !!         \|/ copy |            \|/ copy |            \|/ copy |          !!
-  @@   .---------. .---------.---------. .---------.---------. .---------.   @@
-  @@   | CPU 0X+ | | CPU 1X- | CPU 0Y+ | | CPU 3Y- | CPU 0Z+ | | CPU 5Z- |   @@
-  @@   | CPU 0X- | | CPU 2X+ | CPU 0Y- | | CPU 4Y+ | CPU 0Z- | | CPU 6Z+ |   @@
-  @@   '---------\ /---------'---------\ /---------'---------\ /---------'   @@
-  @@      pointer X swap (X)    pointer X swap (Y)    pointer X swap (Z)     @@
-  @@   .---------/ \---------.---------/ \---------.---------/ \---------.   @@
-  @@   | CPU 1X- | | CPU 0X+ | CPU 3Y- | | CPU 0Y+ | CPU 5Z- | | CPU 0Z+ |   @@
-  @@   | CPU 2X+ | | CPU 0X- | CPU 4Y+ | | CPU 0Y- | CPU 6Z+ | | CPU 0Z- |   @@
-  @@   '---------' '---------'---------' '---------'---------' '---------'   @@
-  !!         /|\ PCIe |            /|\ PCIe |            /|\ PCIe |          !!
-  !!          | copy \|/            | copy \|/            | copy \|/         !!
-  ++   .--------------------..---------------------..--------------------.   ++
-  ++   |   GPU 1 - TB 1X-   ||    GPU 3 - TB 3Y-   ||   GPU 5 - TB 5Z-   |   ++
-  ++   :====================::=====================::====================:   ++
-  ++   |   GPU 2 - TB 2X+   ||    GPU 4 - TB 4Y+   ||   GPU 6 - TB 6Z+   |   ++
-  ++   '--------------------''---------------------''--------------------'   ++
-  ++    /|\ selective in-  |  /|\ selective in-  |  /|\ selective in-  |     ++
-  ++     |  VRAM copy (X) \|/  |  VRAM copy (Y) \|/  |  VRAM copy (Z) \|/    ++
-  ++   .--------------------..---------------------..--------------------.   ++
-  ++   |        GPU 1       ||        GPU 3        ||        GPU 5       |   ++
-  ++   |    LBM Domain 1    ||    LBM Domain 3     ||    LBM Domain 5    |   ++
-  ++   :====================::=====================::====================:   ++
-  ++   |        GPU 2       ||        GPU 4        ||        GPU 6       |   ++
-  ++   |    LBM Domain 2    ||    LBM Domain 4     ||    LBM Domain 6    |   ++
-  ++   '--------------------''---------------------''--------------------'   ++
-  ##              |                     |                     |              ##
-  ##              |      domain synchronization barriers      |              ##
-  ##              |                     |                     |              ##
-  ||   -------------------------------------------------------------> time   ||
-  ```
+  - domain communication architecture (simplified)
+    ```diff
+    ++   .-----------------------------------------------------------------.   ++
+    ++   |                              GPU 0                              |   ++
+    ++   |                          LBM Domain 0                           |   ++
+    ++   '-----------------------------------------------------------------'   ++
+    ++              |                 selective                /|\             ++
+    ++             \|/               in-VRAM copy               |              ++
+    ++        .-------------------------------------------------------.        ++
+    ++        |               GPU 0 - Transfer Buffer 0               |        ++
+    ++        '-------------------------------------------------------'        ++
+    !!                            |     PCIe     /|\                           !!
+    !!                           \|/    copy      |                            !!
+    @@        .-------------------------.   .-------------------------.        @@
+    @@        | CPU - Transfer Buffer 0 |   | CPU - Transfer Buffer 1 |        @@
+    @@        '-------------------------'\ /'-------------------------'        @@
+    @@                           pointer  X   swap                             @@
+    @@        .-------------------------./ \.-------------------------.        @@
+    @@        | CPU - Transfer Buffer 1 |   | CPU - Transfer Buffer 0 |        @@
+    @@        '-------------------------'   '-------------------------'        @@
+    !!                           /|\    PCIe      |                            !!
+    !!                            |     copy     \|/                           !!
+    ++        .-------------------------------------------------------.        ++
+    ++        |               GPU 1 - Transfer Buffer 1               |        ++
+    ++        '-------------------------------------------------------'        ++
+    ++             /|\                selective                 |              ++
+    ++              |                in-VRAM copy              \|/             ++
+    ++   .-----------------------------------------------------------------.   ++
+    ++   |                              GPU 1                              |   ++
+    ++   |                          LBM Domain 1                           |   ++
+    ++   '-----------------------------------------------------------------'   ++
+    ##                                    |                                    ##
+    ##                      domain synchronization barrier                     ##
+    ##                                    |                                    ##
+    ||   -------------------------------------------------------------> time   ||
+    ```
+  - domain communication architecture (detailed)
+    ```diff
+    ++   .-----------------------------------------------------------------.   ++
+    ++   |                              GPU 0                              |   ++
+    ++   |                          LBM Domain 0                           |   ++
+    ++   '-----------------------------------------------------------------'   ++
+    ++     |  selective in- /|\  |  selective in- /|\  |  selective in- /|\    ++
+    ++    \|/ VRAM copy (X)  |  \|/ VRAM copy (Y)  |  \|/ VRAM copy (Z)  |     ++
+    ++   .---------------------.---------------------.---------------------.   ++
+    ++   |    GPU 0 - TB 0X+   |    GPU 0 - TB 0Y+   |    GPU 0 - TB 0Z+   |   ++
+    ++   |    GPU 0 - TB 0X-   |    GPU 0 - TB 0Y-   |    GPU 0 - TB 0Z-   |   ++
+    ++   '---------------------'---------------------'---------------------'   ++
+    !!          | PCIe /|\            | PCIe /|\            | PCIe /|\         !!
+    !!         \|/ copy |            \|/ copy |            \|/ copy |          !!
+    @@   .---------. .---------.---------. .---------.---------. .---------.   @@
+    @@   | CPU 0X+ | | CPU 1X- | CPU 0Y+ | | CPU 3Y- | CPU 0Z+ | | CPU 5Z- |   @@
+    @@   | CPU 0X- | | CPU 2X+ | CPU 0Y- | | CPU 4Y+ | CPU 0Z- | | CPU 6Z+ |   @@
+    @@   '---------\ /---------'---------\ /---------'---------\ /---------'   @@
+    @@      pointer X swap (X)    pointer X swap (Y)    pointer X swap (Z)     @@
+    @@   .---------/ \---------.---------/ \---------.---------/ \---------.   @@
+    @@   | CPU 1X- | | CPU 0X+ | CPU 3Y- | | CPU 0Y+ | CPU 5Z- | | CPU 0Z+ |   @@
+    @@   | CPU 2X+ | | CPU 0X- | CPU 4Y+ | | CPU 0Y- | CPU 6Z+ | | CPU 0Z- |   @@
+    @@   '---------' '---------'---------' '---------'---------' '---------'   @@
+    !!         /|\ PCIe |            /|\ PCIe |            /|\ PCIe |          !!
+    !!          | copy \|/            | copy \|/            | copy \|/         !!
+    ++   .--------------------..---------------------..--------------------.   ++
+    ++   |   GPU 1 - TB 1X-   ||    GPU 3 - TB 3Y-   ||   GPU 5 - TB 5Z-   |   ++
+    ++   :====================::=====================::====================:   ++
+    ++   |   GPU 2 - TB 2X+   ||    GPU 4 - TB 4Y+   ||   GPU 6 - TB 6Z+   |   ++
+    ++   '--------------------''---------------------''--------------------'   ++
+    ++    /|\ selective in-  |  /|\ selective in-  |  /|\ selective in-  |     ++
+    ++     |  VRAM copy (X) \|/  |  VRAM copy (Y) \|/  |  VRAM copy (Z) \|/    ++
+    ++   .--------------------..---------------------..--------------------.   ++
+    ++   |        GPU 1       ||        GPU 3        ||        GPU 5       |   ++
+    ++   |    LBM Domain 1    ||    LBM Domain 3     ||    LBM Domain 5    |   ++
+    ++   :====================::=====================::====================:   ++
+    ++   |        GPU 2       ||        GPU 4        ||        GPU 6       |   ++
+    ++   |    LBM Domain 2    ||    LBM Domain 4     ||    LBM Domain 6    |   ++
+    ++   '--------------------''---------------------''--------------------'   ++
+    ##              |                     |                     |              ##
+    ##              |      domain synchronization barriers      |              ##
+    ##              |                     |                     |              ##
+    ||   -------------------------------------------------------------> time   ||
+    ```
 
   </details>
+- [peak performance on GPUs](#single-gpu-benchmarks) (datacenter/gaming/professional/laptop), validated with roofline model
 - [DDF-shifting](https://www.researchgate.net/publication/362275548_Accuracy_and_performance_of_the_lattice_Boltzmann_method_with_64-bit_32-bit_and_customized_16-bit_number_formats) and other algebraic optimization to minimize round-off error
 - velocity sets: D2Q9, D3Q15, D3Q19 (default), D3Q27
 - collision operators: single-relaxation-time (SRT/BGK) (default), two-relaxation-time (TRT)
-- only 8 flag bits per lattice point (can be used independently / at the same time):
+- <details><summary>only 8 flag bits per lattice point (can be used independently / at the same time)</summary>
+
   - `TYPE_S` (stationary or moving) solid boundaries
   - `TYPE_E` equilibrium boundaries (inflow/outflow)
   - `TYPE_T` temperature boundaries
@@ -213,6 +207,8 @@ $$f_j(i\\%2\\ ?\\ \vec{x}+\vec{e}_i\\ :\\ \vec{x},\\ t+\Delta t)=f_i^\textrm{tem
   - `TYPE_X` remaining for custom use or further extensions
   - `TYPE_Y` remaining for custom use or further extensions
 
+  </details>
+
 
 
 ## Optional Compute Extensions
@@ -447,35 +443,6 @@ Multi-GPU benchmarks are done at the largest possible grid resolution with a cub
 
 
 
-## Maximum Single-Domain Grid Resolution for D3Q19 LBM
-
-| Memory | FP32/FP32 | FP32/FP16 |
-| -----: | --------: | --------: |
-|   1 GB |      224³ |      266³ |
-|   2 GB |      282³ |      336³ |
-|   3 GB |      322³ |      384³ |
-|   4 GB |      354³ |      424³ |
-|   6 GB |      406³ |      484³ |
-|   8 GB |      448³ |      534³ |
-|  10 GB |      482³ |      574³ |
-|  11 GB |      498³ |      594³ |
-|  12 GB |      512³ |      610³ |
-|  16 GB |      564³ |      672³ |
-|  20 GB |      608³ |      724³ |
-|  24 GB |      646³ |      770³ |
-|  32 GB |      710³ |      848³ |
-|  40 GB |      766³ |      912³ |
-|  48 GB |      814³ |      970³ |
-|  64 GB |      896³ |     1068³ |
-|  80 GB |      966³ |     1150³ |
-|  96 GB |     1026³ |     1222³ |
-| 128 GB |     1130³ |     1346³ |
-| 192 GB |     1292³ |     1540³ |
-| 256 GB |     1422³ |     1624³ |
-| 384 GB |     1624³ |     1624³ |
-
-
-
 ## FAQs
 
 ### General

From 0eec60819a4b67b626f7b9accfcbbcdb75c517c3 Mon Sep 17 00:00:00 2001
From: Moritz Lehmann <moritz.lehmann@uni-bayreuth.de>
Date: Wed, 12 Apr 2023 12:45:56 +0200
Subject: [PATCH 2/5] Updated Readme

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 7cfbfd9d..fb55c868 100644
--- a/README.md
+++ b/README.md
@@ -99,7 +99,7 @@ $$f_j(i\\%2\\ ?\\ \vec{x}+\vec{e}_i\\ :\\ \vec{x},\\ t+\Delta t)=f_i^\textrm{tem
 
     | GPU&nbsp;VRAM&nbsp;capacity      | 1&nbsp;GB | 2&nbsp;GB | 3&nbsp;GB | 4&nbsp;GB | 6&nbsp;GB | 8&nbsp;GB | 10&nbsp;GB | 11&nbsp;GB | 12&nbsp;GB | 16&nbsp;GB | 20&nbsp;GB | 24&nbsp;GB | 32&nbsp;GB | 40&nbsp;GB | 48&nbsp;GB | 64&nbsp;GB | 80&nbsp;GB | 94&nbsp;GB | 128&nbsp;GB | 192&nbsp;GB | 256&nbsp;GB |
     | :------------------------------- | --------: | --------: | --------: | --------: | --------: | --------: | ---------: | ---------: | ---------: | ---------: | ---------: | ---------: | ---------: | ---------: | ---------: | ---------: | ---------: | ---------: | ----------: | ----------: | ----------: |
-    | approximate&nbsp;GPU&nbsp;price  | $25<br>GT&nbsp;210 | $25<br>GTX&nbsp;950 | $12<br>GTX&nbsp;1060 | $50<br>GT&nbsp;730 | $35<br>GTX&nbsp;1060 | $70<br>RX&nbsp;470 | $500<br>RTX&nbsp;3080 | $240<br>GTX&nbsp;1080&nbsp;Ti | $75<br>Tesla&nbsp;M40 | $75<br>Instinct&nbsp;MI25 | $900<br>RX&nbsp;7900&nbsp;XT | $205<br>Tesla&nbsp;P40 | $600<br>Instinct&nbsp;MI60 | $5500<br>A100 | $2400<br>RTX&nbsp;8000 | $31k<br>Instinct&nbsp;MI210 | $11k<br>A100 | >$40k<br>H100&nbsp;NVL | ?<br>Max&nbsp;Series&nbsp;1550 | - | - |
+    | approximate&nbsp;GPU&nbsp;price  | $25<br>GT&nbsp;210 | $25<br>GTX&nbsp;950 | $12<br>GTX&nbsp;1060 | $50<br>GT&nbsp;730 | $35<br>GTX&nbsp;1060 | $70<br>RX&nbsp;470 | $500<br>RTX&nbsp;3080 | $240<br>GTX&nbsp;1080&nbsp;Ti | $75<br>Tesla&nbsp;M40 | $75<br>Instinct&nbsp;MI25 | $900<br>RX&nbsp;7900&nbsp;XT | $205<br>Tesla&nbsp;P40 | $600<br>Instinct&nbsp;MI60 | $5500<br>A100 | $2400<br>RTX&nbsp;8000 | $10k<br>Instinct&nbsp;MI210 | $11k<br>A100 | >$40k<br>H100&nbsp;NVL | ?<br>Max&nbsp;Series&nbsp;1550 | - | - |
     | traditional&nbsp;LBM&nbsp;(FP64) |      144³ |      182³ |      208³ |      230³ |      262³ |      288³ |       312³ |       322³ |       330³ |       364³ |       392³ |       418³ |       460³ |       494³ |       526³ |       578³ |       624³ |       658³ |        730³ |        836³ |        920³ |
     | FluidX3D&nbsp;(FP32/FP32)        |      224³ |      282³ |      322³ |      354³ |      406³ |      448³ |       482³ |       498³ |       512³ |       564³ |       608³ |       646³ |       710³ |       766³ |       814³ |       896³ |       966³ |      1018³ |       1130³ |       1292³ |       1422³ |
     | FluidX3D&nbsp;(FP32/FP16)        |      266³ |      336³ |      384³ |      424³ |      484³ |      534³ |       574³ |       594³ |       610³ |       672³ |       724³ |       770³ |       848³ |       912³ |       970³ |      1068³ |      1150³ |      1214³ |       1346³ |       1540³ |       1624³ |

From 9d07709b55a75ed520c7f8deffc6b7fcbcf8bb55 Mon Sep 17 00:00:00 2001
From: Moritz Lehmann <moritz.lehmann@uni-bayreuth.de>
Date: Thu, 13 Apr 2023 00:48:45 +0200
Subject: [PATCH 3/5] Cosmetics for benchmark tables Readme

---
 README.md | 245 ++++++++++++++++++++++++++++--------------------------
 1 file changed, 127 insertions(+), 118 deletions(-)

diff --git a/README.md b/README.md
index fb55c868..d9db1f69 100644
--- a/README.md
+++ b/README.md
@@ -3,7 +3,7 @@
 The fastest and most memory efficient lattice Boltzmann CFD software, running on all GPUs via [OpenCL](https://github.com/ProjectPhysX/OpenCL-Wrapper "OpenCL-Wrapper").
 
 <a href="https://youtu.be/o3TPN142HxM"><img src="https://img.youtube.com/vi/o3TPN142HxM/maxresdefault.jpg" width="50%"></img></a><a href="https://youtu.be/oC6U1M0Fsug"><img src="https://img.youtube.com/vi/oC6U1M0Fsug/maxresdefault.jpg" width="50%"></img></a><br>
-<a href="https://youtu.be/XOfXHgP4jnQ"><img src="https://img.youtube.com/vi/XOfXHgP4jnQ/maxresdefault.jpg" width="50%"></img></a><a href="https://youtu.be/aqG8qZ_Gc4U"><img src="https://img.youtube.com/vi/aqG8qZ_Gc4U/maxresdefault.jpg" width="50%"></img></a>
+<a href="https://youtu.be/XOfXHgP4jnQ"><img src="https://img.youtube.com/vi/XOfXHgP4jnQ/maxresdefault.jpg" width="50%"></img></a><a href="https://youtu.be/3JNVBQyetMA"><img src="https://img.youtube.com/vi/3JNVBQyetMA/maxresdefault.jpg" width="50%"></img></a>
 
 
 <details><summary>Update History</summary>
@@ -316,90 +316,97 @@ In consequence, the arithmetic intensity of this implementation is 2.37 (FP32/FP
 
 If your GPU is not on the list yet, you can report your benchmarks [here](https://github.com/ProjectPhysX/FluidX3D/issues/8).
 
-| Device                        | FP32<br>[TFlops/s] | Mem<br>[GB] | BW<br>[GB/s] | FP32/FP32<br>[MLUPs/s] | FP32/FP16S<br>[MLUPs/s] | FP32/FP16C<br>[MLUPs/s] |
-| :---------------------------- | -----------------: | ----------: | -----------: | ---------------------: | ----------------------: | ----------------------: |
-| AMD Instinct MI250 (1 GCD)    |              45.26 |          64 |         1638 |             5638 (53%) |              9030 (42%) |              8506 (40%) |
-| AMD Radeon VII                |              13.83 |          16 |         1024 |             4898 (73%) |              7778 (58%) |              5256 (40%) |
-| Nvidia H100 PCIe 80GB         |              51.01 |          80 |         2000 |            11128 (85%) |             20624 (79%) |             13862 (53%) |
-| Nvidia A100 SXM4 80GB         |              19.49 |          80 |         2039 |            10228 (77%) |             18448 (70%) |             11197 (42%) |
-| Nvidia A100 SXM4 40GB         |              19.49 |          40 |         1555 |             8522 (84%) |             16013 (79%) |             11251 (56%) |
-| Nvidia A100 PCIe 40GB         |              19.49 |          40 |         1555 |             8526 (84%) |             16035 (79%) |             11088 (55%) |
-| Nvidia Tesla V100 16GB        |              14.13 |          16 |          900 |             5128 (87%) |             10325 (88%) |              7683 (66%) |
-| Nvidia Quadro GV100           |              16.66 |          32 |          870 |             3442 (61%) |              6641 (59%) |              5863 (52%) |
-| Nvidia Tesla P100 16GB        |               9.52 |          16 |          732 |             3295 (69%) |              5950 (63%) |              4176 (44%) |
-| Nvidia Tesla P100 12GB        |               9.52 |          12 |          549 |             2427 (68%) |              4141 (58%) |              3999 (56%) |
-| Nvidia Tesla K40m             |               4.29 |          12 |          288 |             1131 (60%) |              1868 (50%) |               912 (24%) |
-| Nvidia Tesla K80  (1 GPU)     |               4.11 |          12 |          240 |              916 (58%) |              1642 (53%) |               943 (30%) |
-| Nvidia Tesla K20c             |               3.52 |           5 |          208 |              861 (63%) |              1507 (56%) |               720 (27%) |
-| AMD Radeon RX 7900 XTX        |              61.44 |          24 |          960 |             3665 (58%) |              7644 (61%) |              7716 (62%) |
-| AMD Radeon RX 7900 XT         |              51.61 |          20 |          800 |             3013 (58%) |              5856 (56%) |              5986 (58%) |
-| AMD Radeon RX 6900 XT         |              23.04 |          16 |          512 |             1968 (59%) |              4227 (64%) |              4207 (63%) |
-| AMD Radeon RX 6800 XT         |              20.74 |          16 |          512 |             2008 (60%) |              4241 (64%) |              4224 (64%) |
-| AMD Radeon RX 5700 XT         |               9.75 |           8 |          448 |             1368 (47%) |              3253 (56%) |              3049 (52%) |
-| AMD Radeon RX Vega 64         |              13.35 |           8 |          484 |             1875 (59%) |              2878 (46%) |              3227 (51%) |
-| AMD Radeon RX 580 4GB         |               6.50 |           4 |          256 |              946 (57%) |              1848 (56%) |              1577 (47%) |
-| AMD Radeon HD 7850            |               1.84 |           2 |          154 |              112 (11%) |               120 ( 6%) |               635 (32%) |
-| Intel Arc A770 LE             |              19.66 |          16 |          560 |             2741 (75%) |              4591 (63%) |              4626 (64%) |
-| Intel Arc A750 LE             |              17.20 |           8 |          512 |             2625 (78%) |              4184 (63%) |              4238 (64%) |
-| Nvidia GeForce RTX 4090       |              82.58 |          24 |         1008 |             5624 (85%) |             11091 (85%) |             11496 (88%) |
-| Nvidia GeForce RTX 4080       |              55.45 |          16 |          717 |             3914 (84%) |              7626 (82%) |              7933 (85%) |
-| Nvidia GeForce RTX 3090 Ti    |              40.00 |          24 |         1008 |             5717 (87%) |             10956 (84%) |             10400 (79%) |
-| Nvidia GeForce RTX 3090       |              39.05 |          24 |          936 |             5418 (89%) |             10732 (88%) |             10215 (84%) |
-| Nvidia GeForce RTX 3080 Ti    |              37.17 |          12 |          912 |             5202 (87%) |              9832 (87%) |              9347 (79%) |
-| Nvidia RTX A6000              |              40.00 |          48 |          768 |             4421 (88%) |              8814 (88%) |              8533 (86%) |
-| Nvidia GeForce RTX 3080       |              29.77 |          10 |          760 |             4230 (85%) |              8118 (82%) |              7714 (78%) |
-| Nvidia GeForce RTX 3070       |              20.31 |           8 |          448 |             2578 (88%) |              5096 (88%) |              5060 (87%) |
-| Nvidia GeForce RTX 3060 Ti    |              16.49 |           8 |          448 |             2644 (90%) |              5129 (88%) |              4718 (81%) |
-| Nvidia RTX A5000M             |              16.59 |          16 |          448 |             2228 (76%) |              4461 (77%) |              3662 (63%) |
-| Nvidia GeForce RTX 3060       |              13.17 |          12 |          360 |             2108 (90%) |              4070 (87%) |              3566 (76%) |
-| Nvidia GeForce RTX 3060M      |              10.94 |           6 |          336 |             2019 (92%) |              4012 (92%) |              3572 (82%) |
-| Nvidia GeForce RTX 3050M      |               7.13 |           4 |          192 |             1180 (94%) |              2339 (94%) |              2016 (81%) |
-| Nvidia Quadro RTX 6000        |              16.31 |          24 |          672 |             3307 (75%) |              6836 (78%) |              6879 (79%) |
-| Nvidia Quadro RTX 8000 Pass.  |              14.93 |          48 |          624 |             2591 (64%) |              5408 (67%) |              5607 (69%) |
-| Nvidia GeForce RTX 2080 Ti    |              13.45 |          11 |          616 |             3194 (79%) |              6700 (84%) |              6853 (86%) |
-| Nvidia GeForce RTX 2080 Sup.  |              11.34 |           8 |          496 |             2434 (75%) |              5284 (82%) |              5087 (79%) |
-| Nvidia Quadro RTX 5000        |              11.15 |          16 |          448 |             2341 (80%) |              4766 (82%) |              4773 (82%) |
-| Nvidia GeForce RTX 2060 Sup.  |               7.18 |           8 |          448 |             2503 (85%) |              5035 (87%) |              4463 (77%) |
-| Nvidia Quadro RTX 4000        |               7.12 |           8 |          416 |             2284 (84%) |              4584 (85%) |              4062 (75%) |
-| Nvidia GeForce RTX 2060 KO    |               6.74 |           6 |          336 |             1643 (75%) |              3376 (77%) |              3266 (75%) |
-| Nvidia GeForce RTX 2060       |               6.74 |           6 |          336 |             1681 (77%) |              3604 (83%) |              3571 (82%) |
-| Nvidia GeForce GTX 1660 Sup.  |               5.03 |           6 |          336 |             1696 (77%) |              3551 (81%) |              3040 (70%) |
-| Nvidia Tesla T4               |               8.14 |          15 |          300 |             1356 (69%) |              2869 (74%) |              2887 (74%) |
-| Nvidia GeForce GTX 1660 Ti    |               5.48 |           6 |          288 |             1467 (78%) |              3041 (81%) |              3019 (81%) |
-| Nvidia GeForce GTX 1660       |               5.07 |           6 |          192 |             1016 (81%) |              1924 (77%) |              1992 (80%) |
-| Nvidia GeForce GTX 1650M      |               3.20 |           4 |          128 |              706 (84%) |              1214 (73%) |              1400 (84%) |
-| Nvidia Titan Xp               |              12.15 |          12 |          548 |             2919 (82%) |              5495 (77%) |              5375 (76%) |
-| Nvidia GeForce GTX 1080 Ti    |              12.06 |          11 |          484 |             2631 (83%) |              4837 (77%) |              4877 (78%) |
-| Nvidia GeForce GTX 1080       |               9.78 |           8 |          320 |             1623 (78%) |              3100 (75%) |              3182 (77%) |
-| Nvidia GeForce GTX 1060M      |               4.44 |           6 |          192 |              983 (78%) |              1882 (75%) |              1803 (72%) |
-| Nvidia GeForce GTX 1050M Ti   |               2.49 |           4 |          112 |              631 (86%) |              1224 (84%) |              1115 (77%) |
-| Nvidia Quadro P1000           |               1.89 |           4 |           82 |              426 (79%) |               839 (79%) |               778 (73%) |
-| Nvidia GeForce GTX 970        |               4.17 |           4 |          224 |              980 (67%) |              1721 (59%) |              1623 (56%) |
-| Nvidia Quadro M4000           |               2.57 |           8 |          192 |              899 (72%) |              1519 (61%) |              1050 (42%) |
-| Nvidia Tesla M60 (1 GPU)      |               4.82 |           8 |          160 |              853 (82%) |              1571 (76%) |              1557 (75%) |
-| Nvidia GeForce GTX 960M       |               1.51 |           4 |           80 |              442 (84%) |               872 (84%) |               627 (60%) |
-| Nvidia Quadro K2000           |               0.73 |           2 |           64 |              312 (75%) |               444 (53%) |               171 (21%) |
-| Nvidia GeForce GT 630 (OEM)   |               0.46 |           2 |           29 |              151 (81%) |               185 (50%) |                78 (21%) |
-| Nvidia Quadro NVS 290         |               0.03 |       0.256 |            6 |                1 ( 2%) |                 1 ( 1%) |                 1 ( 1%) |
-| Apple M1 Pro GPU 16C 16GB     |               4.10 |          11 |          200 |             1204 (92%) |              2329 (90%) |              1855 (71%) |
-| AMD Radeon Vega 8 (4750G)     |               2.15 |          27 |           57 |              263 (71%) |               511 (70%) |               501 (68%) |
-| AMD Radeon Vega 8 (3500U)     |               1.23 |           7 |           38 |              157 (63%) |               282 (57%) |               288 (58%) |
-| Intel UHD Graphics 630        |               0.46 |           7 |           51 |              151 (45%) |               301 (45%) |               187 (28%) |
-| Intel HD Graphics 5500        |               0.35 |           3 |           26 |               75 (45%) |               192 (58%) |               108 (32%) |
-| Intel HD Graphics 4600        |               0.38 |           2 |           26 |              105 (63%) |               115 (35%) |                34 (10%) |
-| Samsung ARM Mali-G72 MP18     |               0.24 |           4 |           29 |               14 ( 7%) |                17 ( 5%) |                12 ( 3%) |
-| 2x AMD EPYC 9654              |              29.49 |        1536 |          922 |             1381 (23%) |              1814 (15%) |              1801 (15%) |
-| Intel Xeon Phi 7210           |               5.32 |         192 |          102 |              415 (62%) |               193 (15%) |               223 (17%) |
-| 4x Intel Xeon E5-4620 v4      |               2.69 |         512 |          273 |              460 (26%) |               275 ( 8%) |               239 ( 7%) |
-| 2x Intel Xeon E5-2630 v4      |               1.41 |          64 |          137 |              264 (30%) |               146 ( 8%) |               129 ( 7%) |
-| 2x Intel Xeon E5-2623 v4      |               0.67 |          64 |          137 |              125 (14%) |                66 ( 4%) |                59 ( 3%) |
-| 2x Intel Xeon E5-2680 v3      |               1.92 |          64 |          137 |              209 (23%) |               305 (17%) |               281 (16%) |
-| Intel Core i9-10980XE         |               3.23 |         128 |           94 |              286 (47%) |               251 (21%) |               223 (18%) |
-| Intel Core i5-9600            |               0.60 |          16 |           43 |              146 (52%) |               127 (23%) |               147 (27%) |
-| Intel Core i7-8700K           |               0.71 |          16 |           51 |              152 (45%) |               134 (20%) |               116 (17%) |
-| Intel Core i7-7700HQ          |               0.36 |          12 |           38 |               81 (32%) |                82 (16%) |               108 (22%) |
-| Intel Core i7-4770            |               0.44 |          16 |           26 |              104 (62%) |                69 (21%) |                59 (18%) |
-| Intel Core i7-4720HQ          |               0.33 |          16 |           26 |               58 (35%) |                13 ( 4%) |                47 (14%) |
+Colors: 🔴 AMD, 🔵 Intel, 🟢 Nvidia, 🟣 Apple, 🟡 Samsung
+
+| Device                                         | FP32<br>[TFlops/s] | Mem<br>[GB] | BW<br>[GB/s] | FP32/FP32<br>[MLUPs/s] | FP32/FP16S<br>[MLUPs/s] | FP32/FP16C<br>[MLUPs/s] |
+| :--------------------------------------------- | -----------------: | ----------: | -----------: | ---------------------: | ----------------------: | ----------------------: |
+|                                                |                    |             |              |                        |                         |                         |
+| 🔴&nbsp;Instinct&nbsp;MI250&nbsp;(1&nbsp;GCD)  |              45.26 |          64 |         1638 |             5638 (53%) |              9030 (42%) |              8506 (40%) |
+| 🔴&nbsp;Radeon&nbsp;VII                        |              13.83 |          16 |         1024 |             4898 (73%) |              7778 (58%) |              5256 (40%) |
+| 🟢&nbsp;H100&nbsp;PCIe&nbsp;80GB                |              51.01 |          80 |         2000 |       11128&nbsp;(85%) |             20624 (79%) |             13862 (53%) |
+| 🟢&nbsp;A100&nbsp;SXM4&nbsp;80GB                |              19.49 |          80 |         2039 |       10228&nbsp;(77%) |             18448 (70%) |             11197 (42%) |
+| 🟢&nbsp;A100&nbsp;SXM4&nbsp;40GB                |              19.49 |          40 |         1555 |             8522 (84%) |             16013 (79%) |             11251 (56%) |
+| 🟢&nbsp;A100&nbsp;PCIe&nbsp;40GB                |              19.49 |          40 |         1555 |             8526 (84%) |             16035 (79%) |             11088 (55%) |
+| 🟢&nbsp;Tesla&nbsp;V100&nbsp;16GB               |              14.13 |          16 |          900 |             5128 (87%) |             10325 (88%) |              7683 (66%) |
+| 🟢&nbsp;Quadro&nbsp;GV100                       |              16.66 |          32 |          870 |             3442 (61%) |              6641 (59%) |              5863 (52%) |
+| 🟢&nbsp;Tesla&nbsp;P100&nbsp;16GB               |               9.52 |          16 |          732 |             3295 (69%) |              5950 (63%) |              4176 (44%) |
+| 🟢&nbsp;Tesla&nbsp;P100&nbsp;12GB               |               9.52 |          12 |          549 |             2427 (68%) |              4141 (58%) |              3999 (56%) |
+| 🟢&nbsp;Tesla&nbsp;K40m                         |               4.29 |          12 |          288 |             1131 (60%) |              1868 (50%) |               912 (24%) |
+| 🟢&nbsp;Tesla&nbsp;K80&nbsp;(1&nbsp;GPU)        |               4.11 |          12 |          240 |              916 (58%) |              1642 (53%) |               943 (30%) |
+| 🟢&nbsp;Tesla&nbsp;K20c                         |               3.52 |           5 |          208 |              861 (63%) |              1507 (56%) |               720 (27%) |
+|                                                |                    |             |              |                        |                         |                         |
+| 🔴&nbsp;Radeon&nbsp;RX&nbsp;7900&nbsp;XTX      |              61.44 |          24 |          960 |             3665 (58%) |              7644 (61%) |              7716 (62%) |
+| 🔴&nbsp;Radeon&nbsp;RX&nbsp;7900&nbsp;XT       |              51.61 |          20 |          800 |             3013 (58%) |              5856 (56%) |              5986 (58%) |
+| 🔴&nbsp;Radeon&nbsp;RX&nbsp;6900&nbsp;XT       |              23.04 |          16 |          512 |             1968 (59%) |              4227 (64%) |              4207 (63%) |
+| 🔴&nbsp;Radeon&nbsp;RX&nbsp;6800&nbsp;XT       |              20.74 |          16 |          512 |             2008 (60%) |              4241 (64%) |              4224 (64%) |
+| 🔴&nbsp;Radeon&nbsp;RX&nbsp;5700&nbsp;XT       |               9.75 |           8 |          448 |             1368 (47%) |              3253 (56%) |              3049 (52%) |
+| 🔴&nbsp;Radeon&nbsp;RX&nbsp;Vega&nbsp;64       |              13.35 |           8 |          484 |             1875 (59%) |              2878 (46%) |              3227 (51%) |
+| 🔴&nbsp;Radeon&nbsp;RX&nbsp;580&nbsp;4GB       |               6.50 |           4 |          256 |              946 (57%) |              1848 (56%) |              1577 (47%) |
+| 🔴&nbsp;Radeon&nbsp;HD&nbsp;7850               |               1.84 |           2 |          154 |              112 (11%) |               120 ( 6%) |               635 (32%) |
+| 🔵&nbsp;Arc&nbsp;A770&nbsp;LE                  |              19.66 |          16 |          560 |             2741 (75%) |              4591 (63%) |              4626 (64%) |
+| 🔵&nbsp;Arc&nbsp;A750&nbsp;LE                  |              17.20 |           8 |          512 |             2625 (78%) |              4184 (63%) |              4238 (64%) |
+| 🟢&nbsp;GeForce&nbsp;RTX&nbsp;4090              |              82.58 |          24 |         1008 |             5624 (85%) |             11091 (85%) |             11496 (88%) |
+| 🟢&nbsp;RTX&nbsp;6000&nbsp;Ada                  |              91.10 |          48 |          960 |             4997 (80%) |             10249 (82%) |             10293 (83%) |
+| 🟢&nbsp;GeForce&nbsp;RTX&nbsp;4080              |              55.45 |          16 |          717 |             3914 (84%) |              7626 (82%) |              7933 (85%) |
+| 🟢&nbsp;GeForce&nbsp;RTX&nbsp;3090&nbsp;Ti      |              40.00 |          24 |         1008 |             5717 (87%) |             10956 (84%) |             10400 (79%) |
+| 🟢&nbsp;GeForce&nbsp;RTX&nbsp;3090              |              39.05 |          24 |          936 |             5418 (89%) |             10732 (88%) |             10215 (84%) |
+| 🟢&nbsp;GeForce&nbsp;RTX&nbsp;3080&nbsp;Ti      |              37.17 |          12 |          912 |             5202 (87%) |              9832 (87%) |              9347 (79%) |
+| 🟢&nbsp;RTX&nbsp;A6000                          |              40.00 |          48 |          768 |             4421 (88%) |              8814 (88%) |              8533 (86%) |
+| 🟢&nbsp;GeForce&nbsp;RTX&nbsp;3080              |              29.77 |          10 |          760 |             4230 (85%) |              8118 (82%) |              7714 (78%) |
+| 🟢&nbsp;GeForce&nbsp;RTX&nbsp;3070              |              20.31 |           8 |          448 |             2578 (88%) |              5096 (88%) |              5060 (87%) |
+| 🟢&nbsp;GeForce&nbsp;RTX&nbsp;3060&nbsp;Ti      |              16.49 |           8 |          448 |             2644 (90%) |              5129 (88%) |              4718 (81%) |
+| 🟢&nbsp;RTX&nbsp;A5000M                         |              16.59 |          16 |          448 |             2228 (76%) |              4461 (77%) |              3662 (63%) |
+| 🟢&nbsp;GeForce&nbsp;RTX&nbsp;3060              |              13.17 |          12 |          360 |             2108 (90%) |              4070 (87%) |              3566 (76%) |
+| 🟢&nbsp;GeForce&nbsp;RTX&nbsp;3060M             |              10.94 |           6 |          336 |             2019 (92%) |              4012 (92%) |              3572 (82%) |
+| 🟢&nbsp;GeForce&nbsp;RTX&nbsp;3050M             |               7.13 |           4 |          192 |             1180 (94%) |              2339 (94%) |              2016 (81%) |
+| 🟢&nbsp;Quadro&nbsp;RTX&nbsp;6000               |              16.31 |          24 |          672 |             3307 (75%) |              6836 (78%) |              6879 (79%) |
+| 🟢&nbsp;Quadro&nbsp;RTX&nbsp;8000&nbsp;Pass.    |              14.93 |          48 |          624 |             2591 (64%) |              5408 (67%) |              5607 (69%) |
+| 🟢&nbsp;GeForce&nbsp;RTX&nbsp;2080&nbsp;Ti      |              13.45 |          11 |          616 |             3194 (79%) |              6700 (84%) |              6853 (86%) |
+| 🟢&nbsp;GeForce&nbsp;RTX&nbsp;2080&nbsp;Sup.    |              11.34 |           8 |          496 |             2434 (75%) |              5284 (82%) |              5087 (79%) |
+| 🟢&nbsp;Quadro&nbsp;RTX&nbsp;5000               |              11.15 |          16 |          448 |             2341 (80%) |              4766 (82%) |              4773 (82%) |
+| 🟢&nbsp;GeForce&nbsp;RTX&nbsp;2060&nbsp;Sup.    |               7.18 |           8 |          448 |             2503 (85%) |              5035 (87%) |              4463 (77%) |
+| 🟢&nbsp;Quadro&nbsp;RTX&nbsp;4000               |               7.12 |           8 |          416 |             2284 (84%) |              4584 (85%) |              4062 (75%) |
+| 🟢&nbsp;GeForce&nbsp;RTX&nbsp;2060&nbsp;KO      |               6.74 |           6 |          336 |             1643 (75%) |              3376 (77%) |              3266 (75%) |
+| 🟢&nbsp;GeForce&nbsp;RTX&nbsp;2060              |               6.74 |           6 |          336 |             1681 (77%) |              3604 (83%) |              3571 (82%) |
+| 🟢&nbsp;GeForce&nbsp;GTX&nbsp;1660&nbsp;Sup.    |               5.03 |           6 |          336 |             1696 (77%) |              3551 (81%) |              3040 (70%) |
+| 🟢&nbsp;Tesla&nbsp;T4                           |               8.14 |          15 |          300 |             1356 (69%) |              2869 (74%) |              2887 (74%) |
+| 🟢&nbsp;GeForce&nbsp;GTX&nbsp;1660&nbsp;Ti      |               5.48 |           6 |          288 |             1467 (78%) |              3041 (81%) |              3019 (81%) |
+| 🟢&nbsp;GeForce&nbsp;GTX&nbsp;1660              |               5.07 |           6 |          192 |             1016 (81%) |              1924 (77%) |              1992 (80%) |
+| 🟢&nbsp;GeForce&nbsp;GTX&nbsp;1650M             |               3.20 |           4 |          128 |              706 (84%) |              1214 (73%) |              1400 (84%) |
+| 🟢&nbsp;Titan&nbsp;Xp                           |              12.15 |          12 |          548 |             2919 (82%) |              5495 (77%) |              5375 (76%) |
+| 🟢&nbsp;GeForce&nbsp;GTX&nbsp;1080&nbsp;Ti      |              12.06 |          11 |          484 |             2631 (83%) |              4837 (77%) |              4877 (78%) |
+| 🟢&nbsp;GeForce&nbsp;GTX&nbsp;1080              |               9.78 |           8 |          320 |             1623 (78%) |              3100 (75%) |              3182 (77%) |
+| 🟢&nbsp;GeForce&nbsp;GTX&nbsp;1060M             |               4.44 |           6 |          192 |              983 (78%) |              1882 (75%) |              1803 (72%) |
+| 🟢&nbsp;GeForce&nbsp;GTX&nbsp;1050M&nbsp;Ti     |               2.49 |           4 |          112 |              631 (86%) |              1224 (84%) |              1115 (77%) |
+| 🟢&nbsp;Quadro&nbsp;P1000                       |               1.89 |           4 |           82 |              426 (79%) |               839 (79%) |               778 (73%) |
+| 🟢&nbsp;GeForce&nbsp;GTX&nbsp;970               |               4.17 |           4 |          224 |              980 (67%) |              1721 (59%) |              1623 (56%) |
+| 🟢&nbsp;Quadro&nbsp;M4000                       |               2.57 |           8 |          192 |              899 (72%) |              1519 (61%) |              1050 (42%) |
+| 🟢&nbsp;Tesla&nbsp;M60&nbsp;(1&nbsp;GPU)        |               4.82 |           8 |          160 |              853 (82%) |              1571 (76%) |              1557 (75%) |
+| 🟢&nbsp;GeForce&nbsp;GTX&nbsp;960M              |               1.51 |           4 |           80 |              442 (84%) |               872 (84%) |               627 (60%) |
+| 🟢&nbsp;Quadro&nbsp;K2000                       |               0.73 |           2 |           64 |              312 (75%) |               444 (53%) |               171 (21%) |
+| 🟢&nbsp;GeForce&nbsp;GT&nbsp;630&nbsp;(OEM)     |               0.46 |           2 |           29 |              151 (81%) |               185 (50%) |                78 (21%) |
+| 🟢&nbsp;Quadro&nbsp;NVS&nbsp;290                |               0.03 |       0.256 |            6 |                1 ( 2%) |                 1 ( 1%) |                 1 ( 1%) |
+|                                                |                    |             |              |                        |                         |                         |
+| 🟣&nbsp;M1&nbsp;Pro&nbsp;GPU&nbsp;16C&nbsp;16GB |               4.10 |          11 |          200 |             1204 (92%) |              2329 (90%) |              1855 (71%) |
+| 🔴&nbsp;Radeon&nbsp;Vega&nbsp;8&nbsp;(4750G)  |               2.15 |          27 |           57 |              263 (71%) |               511 (70%) |               501 (68%) |
+| 🔴&nbsp;Radeon&nbsp;Vega&nbsp;8&nbsp;(3500U)  |               1.23 |           7 |           38 |              157 (63%) |               282 (57%) |               288 (58%) |
+| 🔵&nbsp;UHD&nbsp;Graphics&nbsp;630            |               0.46 |           7 |           51 |              151 (45%) |               301 (45%) |               187 (28%) |
+| 🔵&nbsp;HD&nbsp;Graphics&nbsp;5500            |               0.35 |           3 |           26 |               75 (45%) |               192 (58%) |               108 (32%) |
+| 🔵&nbsp;HD&nbsp;Graphics&nbsp;4600            |               0.38 |           2 |           26 |              105 (63%) |               115 (35%) |                34 (10%) |
+| 🟡&nbsp;ARM&nbsp;Mali-G72&nbsp;MP18             |               0.24 |           4 |           29 |               14 ( 7%) |                17 ( 5%) |                12 ( 3%) |
+|                                                |                    |             |              |                        |                         |                         |
+| 🔴&nbsp;2x&nbsp;EPYC&nbsp;9654                |              29.49 |        1536 |          922 |             1381 (23%) |              1814 (15%) |              1801 (15%) |
+| 🔵&nbsp;Xeon&nbsp;Phi&nbsp;7210               |               5.32 |         192 |          102 |              415 (62%) |               193 (15%) |               223 (17%) |
+| 🔵&nbsp;4x&nbsp;Xeon&nbsp;E5-4620&nbsp;v4     |               2.69 |         512 |          273 |              460 (26%) |               275 ( 8%) |               239 ( 7%) |
+| 🔵&nbsp;2x&nbsp;Xeon&nbsp;E5-2630&nbsp;v4     |               1.41 |          64 |          137 |              264 (30%) |               146 ( 8%) |               129 ( 7%) |
+| 🔵&nbsp;2x&nbsp;Xeon&nbsp;E5-2623&nbsp;v4     |               0.67 |          64 |          137 |              125 (14%) |                66 ( 4%) |                59 ( 3%) |
+| 🔵&nbsp;2x&nbsp;Xeon&nbsp;E5-2680&nbsp;v3     |               1.92 |          64 |          137 |              209 (23%) |               305 (17%) |               281 (16%) |
+| 🔵&nbsp;Core&nbsp;i9-10980XE                  |               3.23 |         128 |           94 |              286 (47%) |               251 (21%) |               223 (18%) |
+| 🔵&nbsp;Core&nbsp;i5-9600                     |               0.60 |          16 |           43 |              146 (52%) |               127 (23%) |               147 (27%) |
+| 🔵&nbsp;Core&nbsp;i7-8700K                    |               0.71 |          16 |           51 |              152 (45%) |               134 (20%) |               116 (17%) |
+| 🔵&nbsp;Core&nbsp;i7-7700HQ                   |               0.36 |          12 |           38 |               81 (32%) |                82 (16%) |               108 (22%) |
+| 🔵&nbsp;Core&nbsp;i7-4770                     |               0.44 |          16 |           26 |              104 (62%) |                69 (21%) |                59 (18%) |
+| 🔵&nbsp;Core&nbsp;i7-4720HQ                   |               0.33 |          16 |           26 |               58 (35%) |                13 ( 4%) |                47 (14%) |
 
 
 
@@ -407,39 +414,41 @@ If your GPU is not on the list yet, you can report your benchmarks [here](https:
 
 Multi-GPU benchmarks are done at the largest possible grid resolution with a cubic domain, and either 2x1x1, 2x2x1 or 2x2x2 of these cubic domains together. The percentages in brackets are single-GPU roofline model efficiency, and the multiplicator numbers in brackets are scaling factors relative to benchmarked single-GPU performance.
 
-| Device                        | FP32<br>[TFlops/s] | Mem<br>[GB] | BW<br>[GB/s] | FP32/FP32<br>[MLUPs/s] | FP32/FP16S<br>[MLUPs/s] | FP32/FP16C<br>[MLUPs/s] |
-| :---------------------------- | -----------------: | ----------: | -----------: | ---------------------: | ----------------------: | ----------------------: |
-|                               |                    |             |              |                        |                         |                         |
-| 1x AMD Instinct MI250 (1 GCD) |              45.26 |          64 |         1638 |             5638 (53%) |              9030 (42%) |              8506 (40%) |
-| 1x AMD Instinct MI250 (2 GCD) |              90.52 |         128 |         3277 |            9460 (1.7x) |            14313 (1.6x) |            17338 (2.0x) |
-| 2x AMD Instinct MI250 (4 GCD) |             181.04 |         256 |         6554 |           16925 (3.0x) |            29163 (3.2x) |            29627 (3.5x) |
-| 4x AMD Instinct MI250 (8 GCD) |             362.08 |         512 |        13107 |           27350 (4.9x) |            52258 (5.8x) |            53521 (6.3x) |
-|                               |                    |             |              |                        |                         |                         |
-| 1x AMD Radeon VII             |              13.83 |          16 |         1024 |             4898 (73%) |              7778 (58%) |              5256 (40%) |
-| 2x AMD Radeon VII             |              27.66 |          32 |         2048 |            8113 (1.7x) |            15591 (2.0x) |            10352 (2.0x) |
-| 4x AMD Radeon VII             |              55.32 |          64 |         4096 |           12911 (2.6x) |            24273 (3.1x) |            17080 (3.2x) |
-| 8x AMD Radeon VII             |             110.64 |         128 |         8192 |           21946 (4.5x) |            30826 (4.0x) |            24572 (4.7x) |
-|                               |                    |             |              |                        |                         |                         |
-| 1x Nvidia A100 SXM4 40GB      |              19.49 |          40 |         1555 |             8522 (84%) |             16013 (79%) |             11251 (56%) |
-| 2x Nvidia A100 SXM4 40GB      |              38.98 |          80 |         3110 |           13629 (1.6x) |            24620 (1.5x) |            18850 (1.7x) |
-| 4x Nvidia A100 SXM4 40GB      |              77.96 |         160 |         6220 |           17978 (2.1x) |            30604 (1.9x) |            30627 (2.7x) |
-|                               |                    |             |              |                        |                         |                         |
-| 1x Nvidia Tesla K40m          |               4.29 |          12 |          288 |             1131 (60%) |              1868 (50%) |               912 (24%) |
-| 2x Nvidia Tesla K40m          |               8.58 |          24 |          577 |            1971 (1.7x) |             3300 (1.8x) |             1801 (2.0x) |
-| 3x Tesla K40m + 1x Titan Xp   |              17.16 |          48 |         1154 |            3117 (2.8x) |             5174 (2.8x) |             3127 (3.4x) |
-|                               |                    |             |              |                        |                         |                         |
-| 1x Nvidia RTX A6000           |              40.00 |          48 |          768 |             4421 (88%) |              8814 (88%) |              8533 (86%) |
-| 2x Nvidia RTX A6000           |              80.00 |          96 |         1536 |            8041 (1.8x) |            15026 (1.7x) |            14795 (1.7x) |
-| 4x Nvidia RTX A6000           |             160.00 |         192 |         3072 |           14314 (3.2x) |            27915 (3.2x) |            27227 (3.2x) |
-| 8x Nvidia RTX A6000           |             320.00 |         384 |         6144 |           19311 (4.4x) |            40063 (4.5x) |            39004 (4.6x) |
-|                               |                    |             |              |                        |                         |                         |
-| 1x Nvidia Quadro RTX 8000 Pa. |              14.93 |          48 |          624 |             2591 (64%) |              5408 (67%) |              5607 (69%) |
-| 2x Nvidia Quadro RTX 8000 Pa. |              29.86 |          96 |         1248 |            4767 (1.8x) |             9607 (1.8x) |            10214 (1.8x) |
-|                               |                    |             |              |                        |                         |                         |
-| 1x Nvidia GeForce RTX 2080 Ti |              13.45 |          11 |          616 |             3194 (79%) |              6700 (84%) |              6853 (86%) |
-| 2x Nvidia GeForce RTX 2080 Ti |              26.90 |          22 |         1232 |            5085 (1.6x) |            10770 (1.6x) |            10922 (1.6x) |
-| 4x Nvidia GeForce RTX 2080 Ti |              53.80 |          44 |         2464 |            9117 (2.9x) |            18415 (2.7x) |            18598 (2.7x) |
-| 7x RTX 2080 Ti + 1x A100 40GB |             107.60 |          88 |         4928 |           16146 (5.1x) |            33732 (5.0x) |            33857 (4.9x) |
+Colors: 🔴 AMD, 🔵 Intel, 🟢 Nvidia, 🟣 Apple, 🟡 Samsung
+
+| Device                                                         | FP32<br>[TFlops/s] | Mem<br>[GB] | BW<br>[GB/s] | FP32/FP32<br>[MLUPs/s] | FP32/FP16S<br>[MLUPs/s] | FP32/FP16C<br>[MLUPs/s] |
+| :------------------------------------------------------------- | -----------------: | ----------: | -----------: | ---------------------: | ----------------------: | ----------------------: |
+|                                                                |                    |             |              |                        |                         |                         |
+| 🔴&nbsp;1x&nbsp;Instinct&nbsp;MI250&nbsp;(1&nbsp;GCD)          |              45.26 |          64 |         1638 |             5638 (53%) |              9030 (42%) |              8506 (40%) |
+| 🔴&nbsp;1x&nbsp;Instinct&nbsp;MI250&nbsp;(2&nbsp;GCD)          |              90.52 |         128 |         3277 |            9460 (1.7x) |            14313 (1.6x) |            17338 (2.0x) |
+| 🔴&nbsp;2x&nbsp;Instinct&nbsp;MI250&nbsp;(4&nbsp;GCD)          |             181.04 |         256 |         6554 |           16925 (3.0x) |            29163 (3.2x) |            29627 (3.5x) |
+| 🔴&nbsp;4x&nbsp;Instinct&nbsp;MI250&nbsp;(8&nbsp;GCD)          |             362.08 |         512 |        13107 |           27350 (4.9x) |            52258 (5.8x) |            53521 (6.3x) |
+|                                                                |                    |             |              |                        |                         |                         |
+| 🔴&nbsp;1x&nbsp;Radeon&nbsp;VII                                |              13.83 |          16 |         1024 |             4898 (73%) |              7778 (58%) |              5256 (40%) |
+| 🔴&nbsp;2x&nbsp;Radeon&nbsp;VII                                |              27.66 |          32 |         2048 |            8113 (1.7x) |            15591 (2.0x) |            10352 (2.0x) |
+| 🔴&nbsp;4x&nbsp;Radeon&nbsp;VII                                |              55.32 |          64 |         4096 |           12911 (2.6x) |            24273 (3.1x) |            17080 (3.2x) |
+| 🔴&nbsp;8x&nbsp;Radeon&nbsp;VII                                |             110.64 |         128 |         8192 |           21946 (4.5x) |            30826 (4.0x) |            24572 (4.7x) |
+|                                                                |                    |             |              |                        |                         |                         |
+| 🟢&nbsp;1x&nbsp;A100&nbsp;SXM4&nbsp;40GB                        |              19.49 |          40 |         1555 |             8522 (84%) |             16013 (79%) |             11251 (56%) |
+| 🟢&nbsp;2x&nbsp;A100&nbsp;SXM4&nbsp;40GB                        |              38.98 |          80 |         3110 |           13629 (1.6x) |            24620 (1.5x) |            18850 (1.7x) |
+| 🟢&nbsp;4x&nbsp;A100&nbsp;SXM4&nbsp;40GB                        |              77.96 |         160 |         6220 |           17978 (2.1x) |            30604 (1.9x) |            30627 (2.7x) |
+|                                                                |                    |             |              |                        |                         |                         |
+| 🟢&nbsp;1x&nbsp;Tesla&nbsp;K40m                                 |               4.29 |          12 |          288 |             1131 (60%) |              1868 (50%) |               912 (24%) |
+| 🟢&nbsp;2x&nbsp;Tesla&nbsp;K40m                                 |               8.58 |          24 |          577 |            1971 (1.7x) |             3300 (1.8x) |             1801 (2.0x) |
+| 🟢&nbsp;3x&nbsp;K40m&nbsp;+&nbsp;1x&nbsp;Titan&nbsp;Xp          |              17.16 |          48 |         1154 |            3117 (2.8x) |             5174 (2.8x) |             3127 (3.4x) |
+|                                                                |                    |             |              |                        |                         |                         |
+| 🟢&nbsp;1x&nbsp;RTX&nbsp;A6000                                  |              40.00 |          48 |          768 |             4421 (88%) |              8814 (88%) |              8533 (86%) |
+| 🟢&nbsp;2x&nbsp;RTX&nbsp;A6000                                  |              80.00 |          96 |         1536 |            8041 (1.8x) |            15026 (1.7x) |            14795 (1.7x) |
+| 🟢&nbsp;4x&nbsp;RTX&nbsp;A6000                                  |             160.00 |         192 |         3072 |           14314 (3.2x) |            27915 (3.2x) |            27227 (3.2x) |
+| 🟢&nbsp;8x&nbsp;RTX&nbsp;A6000                                  |             320.00 |         384 |         6144 |           19311 (4.4x) |            40063 (4.5x) |            39004 (4.6x) |
+|                                                                |                    |             |              |                        |                         |                         |
+| 🟢&nbsp;1x&nbsp;Quadro&nbsp;RTX&nbsp;8000&nbsp;Pa.              |              14.93 |          48 |          624 |             2591 (64%) |              5408 (67%) |              5607 (69%) |
+| 🟢&nbsp;2x&nbsp;Quadro&nbsp;RTX&nbsp;8000&nbsp;Pa.              |              29.86 |          96 |         1248 |            4767 (1.8x) |             9607 (1.8x) |            10214 (1.8x) |
+|                                                                |                    |             |              |                        |                         |                         |
+| 🟢&nbsp;1x&nbsp;GeForce&nbsp;RTX&nbsp;2080&nbsp;Ti              |              13.45 |          11 |          616 |             3194 (79%) |              6700 (84%) |              6853 (86%) |
+| 🟢&nbsp;2x&nbsp;GeForce&nbsp;RTX&nbsp;2080&nbsp;Ti              |              26.90 |          22 |         1232 |            5085 (1.6x) |            10770 (1.6x) |            10922 (1.6x) |
+| 🟢&nbsp;4x&nbsp;GeForce&nbsp;RTX&nbsp;2080&nbsp;Ti              |              53.80 |          44 |         2464 |            9117 (2.9x) |            18415 (2.7x) |            18598 (2.7x) |
+| 🟢&nbsp;7x&nbsp;2080&nbsp;Ti&nbsp;+&nbsp;1x&nbsp;A100&nbsp;40GB |             107.60 |          88 |         4928 |           16146 (5.1x) |            33732 (5.0x) |            33857 (4.9x) |
 
 
 

From 042f51a5d3168725c19d429110d26501ccf2770a Mon Sep 17 00:00:00 2001
From: Moritz Lehmann <moritz.lehmann@uni-bayreuth.de>
Date: Sat, 15 Apr 2023 10:57:41 +0200
Subject: [PATCH 4/5] Cosmetics in Readme

---
 README.md | 24 ++++++++++++------------
 1 file changed, 12 insertions(+), 12 deletions(-)

diff --git a/README.md b/README.md
index d9db1f69..8e0a75d5 100644
--- a/README.md
+++ b/README.md
@@ -95,6 +95,18 @@ $$f_j(i\\%2\\ ?\\ \vec{x}+\vec{e}_i\\ :\\ \vec{x},\\ t+\Delta t)=f_i^\textrm{tem
     - allows for 19 Million cells per 1 GB VRAM
     - in-place streaming with [Esoteric-Pull](https://doi.org/10.3390/computation10060092): eliminates redundant copy `B` of density distribution functions (DDFs) in memory; almost cuts memory demand in half and slightly increases performance due to implicit bounce-back boundaries; offers optimal memory access patterns for single-cell in-place streaming
     - [decoupled arithmetic precision (FP32) and memory precision (FP32 or FP16S or FP16C)](https://www.researchgate.net/publication/362275548_Accuracy_and_performance_of_the_lattice_Boltzmann_method_with_64-bit_32-bit_and_customized_16-bit_number_formats): all arithmetic is done in FP32 for compatibility on all hardware, but DDFs in memory can be compressed to FP16S or FP16C: almost cuts memory demand in half again and almost doubles performance, without impacting overall accuracy for most setups
+    - <details><summary>only 8 flag bits per lattice point (can be used independently / at the same time)</summary>
+
+      - `TYPE_S` (stationary or moving) solid boundaries
+      - `TYPE_E` equilibrium boundaries (inflow/outflow)
+      - `TYPE_T` temperature boundaries
+      - `TYPE_F` free surface (fluid)
+      - `TYPE_I` free surface (interface)
+      - `TYPE_G` free surface (gas)
+      - `TYPE_X` remaining for custom use or further extensions
+      - `TYPE_Y` remaining for custom use or further extensions
+
+      </details>
   - large cost saving: comparison of maximum single-GPU grid resolution for D3Q19 LBM
 
     | GPU&nbsp;VRAM&nbsp;capacity      | 1&nbsp;GB | 2&nbsp;GB | 3&nbsp;GB | 4&nbsp;GB | 6&nbsp;GB | 8&nbsp;GB | 10&nbsp;GB | 11&nbsp;GB | 12&nbsp;GB | 16&nbsp;GB | 20&nbsp;GB | 24&nbsp;GB | 32&nbsp;GB | 40&nbsp;GB | 48&nbsp;GB | 64&nbsp;GB | 80&nbsp;GB | 94&nbsp;GB | 128&nbsp;GB | 192&nbsp;GB | 256&nbsp;GB |
@@ -196,18 +208,6 @@ $$f_j(i\\%2\\ ?\\ \vec{x}+\vec{e}_i\\ :\\ \vec{x},\\ t+\Delta t)=f_i^\textrm{tem
 - [DDF-shifting](https://www.researchgate.net/publication/362275548_Accuracy_and_performance_of_the_lattice_Boltzmann_method_with_64-bit_32-bit_and_customized_16-bit_number_formats) and other algebraic optimization to minimize round-off error
 - velocity sets: D2Q9, D3Q15, D3Q19 (default), D3Q27
 - collision operators: single-relaxation-time (SRT/BGK) (default), two-relaxation-time (TRT)
-- <details><summary>only 8 flag bits per lattice point (can be used independently / at the same time)</summary>
-
-  - `TYPE_S` (stationary or moving) solid boundaries
-  - `TYPE_E` equilibrium boundaries (inflow/outflow)
-  - `TYPE_T` temperature boundaries
-  - `TYPE_F` free surface (fluid)
-  - `TYPE_I` free surface (interface)
-  - `TYPE_G` free surface (gas)
-  - `TYPE_X` remaining for custom use or further extensions
-  - `TYPE_Y` remaining for custom use or further extensions
-
-  </details>
 
 
 

From 8c25a1f6624ce42b071c505dc34d836f69b42085 Mon Sep 17 00:00:00 2001
From: Moritz Lehmann <moritz.lehmann@uni-bayreuth.de>
Date: Sun, 16 Apr 2023 12:16:31 +0200
Subject: [PATCH 5/5] FluidX3D v2.6 update: patched OpenCL issues of Intel Arc
 GPUs: now VRAM allocations >4GB are possible and correct VRAM capacity is
 reported

---
 README.md      |  8 +++++---
 src/info.cpp   |  2 +-
 src/opencl.hpp | 21 ++++++++++++++++++---
 3 files changed, 24 insertions(+), 7 deletions(-)

diff --git a/README.md b/README.md
index 8e0a75d5..ec56d81b 100644
--- a/README.md
+++ b/README.md
@@ -55,6 +55,8 @@ The fastest and most memory efficient lattice Boltzmann CFD software, running on
   - improved raytracing framerate when camera is inside fluid
   - fixed skybox pole flickering artifacts
   - fixed bug where moving objects during re-voxelization would leave an erroneous trail of solid grid cells behind
+- v2.6 (16.04.2023)
+  - patched OpenCL issues of Intel Arc GPUs: now VRAM allocations >4GB are possible and correct VRAM capacity is reported
 
 </details>
 
@@ -66,8 +68,10 @@ The fastest and most memory efficient lattice Boltzmann CFD software, running on
   - streaming (part 2/2)<p align="center"><i>f</i><sub>0</sub><sup>temp</sup>(<i>x</i>,<i>t</i>) = <i>f</i><sub>0</sub>(<i>x</i>, <i>t</i>)<br><i>f<sub>i</sub></i><sup>temp</sup>(<i>x</i>,<i>t</i>) = <i>f</i><sub>(<i>t</i>%2 ? <i>i</i> : (<i>i</i>%2 ? <i>i</i>+1 : <i>i</i>-1))</sub>(<i>i</i>%2 ? <i>x</i> : <i>x</i>-<i>e<sub>i</sub></i>, <i>t</i>) &nbsp; for &nbsp; <i>i</i> &isin; [1, <i>q</i>-1]</p>
   - collision<p align="center"><i>&rho;</i>(<i>x</i>,<i>t</i>) = (&Sigma;<sub><i>i</i></sub> <i>f<sub>i</sub></i><sup>temp</sup>(<i>x</i>,<i>t</i>)) + 1<br><br><i>u</i>(<i>x</i>,<i>t</i>) = <sup>1</sup>&#8725;<sub><i>&rho;</i>(<i>x</i>,<i>t</i>)</sub> &Sigma;<sub><i>i</i></sub> <i>c<sub>i</sub></i> <i>f<sub>i</sub></i><sup>temp</sup>(<i>x</i>,<i>t</i>)<br><br><i>f<sub>i</sub></i><sup>eq-shifted</sup>(<i>x</i>,<i>t</i>) = <i>w<sub>i</sub></i> <i>&rho;</i> · (<sup>(<i>u</i><sub>°</sub><i>c<sub>i</sub></i>)<sup>2</sup></sup>&#8725;<sub>(2<i>c</i><sup>4</sup>)</sub> - <sup>(<i>u</i><sub>°</sub><i>u</i>)</sup>&#8725;<sub>(2c<sup>2</sup>)</sub> + <sup>(<i>u</i><sub>°</sub><i>c<sub>i</sub></i>)</sup>&#8725;<sub><i>c</i><sup>2</sup></sub>) + <i>w<sub>i</sub></i> (<i>&rho;</i>-1)<br><br><i>f<sub>i</sub></i><sup>temp</sup>(<i>x</i>, <i>t</i>+&Delta;<i>t</i>) = <i>f<sub>i</sub></i><sup>temp</sup>(<i>x</i>,<i>t</i>) + <i>&Omega;<sub>i</sub></i>(<i>f<sub>i</sub></i><sup>temp</sup>(<i>x</i>,<i>t</i>), <i>f<sub>i</sub></i><sup>eq-shifted</sup>(<i>x</i>,<i>t</i>), <i>&tau;</i>)</p>
   - streaming (part 1/2)<p align="center"><i>f</i><sub>0</sub>(<i>x</i>, <i>t</i>+&Delta;<i>t</i>) = <i>f</i><sub>0</sub><sup>temp</sup>(<i>x</i>, <i>t</i>+&Delta;<i>t</i>)<br><i>f</i><sub>(<i>t</i>%2 ? (<i>i</i>%2 ? <i>i</i>+1 : <i>i</i>-1) : <i>i</i>)</sub>(<i>i</i>%2 ? <i>x</i>+<i>e<sub>i</sub></i> : <i>x</i>, <i>t</i>+&Delta;<i>t</i>) = <i>f<sub>i</sub></i><sup>temp</sup>(<i>x</i>, <i>t</i>+&Delta;<i>t</i>) &nbsp; for &nbsp; <i>i</i> &isin; [1, <i>q</i>-1]</p>
+  - velocity sets: D2Q9, D3Q15, D3Q19 (default), D3Q27
+  - collision operators: single-relaxation-time (SRT/BGK) (default), two-relaxation-time (TRT)
 
-</details>
+  </details>
 
 <!-- markdown equations don't render properly in mobile browser
   - streaming (part 2/2):
@@ -206,8 +210,6 @@ $$f_j(i\\%2\\ ?\\ \vec{x}+\vec{e}_i\\ :\\ \vec{x},\\ t+\Delta t)=f_i^\textrm{tem
   </details>
 - [peak performance on GPUs](#single-gpu-benchmarks) (datacenter/gaming/professional/laptop), validated with roofline model
 - [DDF-shifting](https://www.researchgate.net/publication/362275548_Accuracy_and_performance_of_the_lattice_Boltzmann_method_with_64-bit_32-bit_and_customized_16-bit_number_formats) and other algebraic optimization to minimize round-off error
-- velocity sets: D2Q9, D3Q15, D3Q19 (default), D3Q27
-- collision operators: single-relaxation-time (SRT/BGK) (default), two-relaxation-time (TRT)
 
 
 
diff --git a/src/info.cpp b/src/info.cpp
index c02ac129..1d408ad0 100644
--- a/src/info.cpp
+++ b/src/info.cpp
@@ -67,7 +67,7 @@ void Info::print_logo() const {
 	print("|                                  ");                print("\\  \\ /  /", c);                 print("                                  |\n");
 	print("|                                   ");                print("\\  '  /", c);                  print("                                   |\n");
 	print("|                                    ");                print("\\   /", c);                  print("                                    |\n");
-	print("|                                     ");                print("\\ /", c);                  print("                FluidX3D Version 2.5 |\n");
+	print("|                                     ");                print("\\ /", c);                  print("                FluidX3D Version 2.6 |\n");
 	print("|                                      ");                 print("'", c);                  print("         Copyright (c) Moritz Lehmann |\n");
 }
 void Info::print_initialize() {
diff --git a/src/opencl.hpp b/src/opencl.hpp
index 7a21f20c..677a1c10 100644
--- a/src/opencl.hpp
+++ b/src/opencl.hpp
@@ -24,6 +24,7 @@ struct Device_Info {
 	uint compute_units=0u; // compute units (CUs) can contain multiple cores depending on the microarchitecture
 	uint clock_frequency=0u; // in MHz
 	bool is_cpu=false, is_gpu=false;
+	bool intel_gpu_above_4gb_patch = false; // memory allocations greater than 4GB need to be specifically enabled on Intel GPUs
 	uint is_fp64_capable=0u, is_fp32_capable=0u, is_fp16_capable=0u, is_int64_capable=0u, is_int32_capable=0u, is_int16_capable=0u, is_int8_capable=0u;
 	uint cores=0u; // for CPUs, compute_units is the number of threads (twice the number of cores with hyperthreading)
 	float tflops=0.0f; // estimated device FP32 floating point performance in TeraFLOPs/s
@@ -63,6 +64,19 @@ struct Device_Info {
 		const float arm = (float)(contains(to_lower(vendor), "arm"))*(is_gpu?8.0f:1.0f); // ARM GPUs usually have 8 cores/CU, ARM CPUs have 1 core/CU
 		cores = to_uint((float)compute_units*(nvidia+amd+intel+apple+arm)); // for CPUs, compute_units is the number of threads (twice the number of cores with hyperthreading)
 		tflops = 1E-6f*(float)cores*(float)ipc*(float)clock_frequency; // estimated device floating point performance in TeraFLOPs/s
+		if(intel==8.0f) { // fix wrong global memory reporting for Intel Arc GPUs; each line matches one device name / ID string together with the exact misreported capacity in MB
+			if(contains_any(name, {"A770", "0x56a0"})&&(memory==12992u)) memory = 16240u; // fix wrong (80% on Windows) memory reporting on Intel Arc A770 16GB
+			if(contains_any(name, {"A770", "0x56a0"})&&(memory== 6476u)) memory =  8096u; // fix wrong (80% on Windows) memory reporting on Intel Arc A770 8GB
+			if(contains_any(name, {"A750", "0x56a1"})&&(memory== 6476u)) memory =  8096u; // fix wrong (80% on Windows) memory reporting on Intel Arc A750 8GB
+			if(contains_any(name, {"A580", "0x56a2"})&&(memory== 6476u)) memory =  8096u; // fix wrong (80% on Windows) memory reporting on Intel Arc A580 8GB
+			if(contains_any(name, {"A380", "0x56a5"})&&(memory== 4844u)) memory =  6056u; // fix wrong (80% on Windows) memory reporting on Intel Arc A380 6GB
+			if(contains_any(name, {"A770", "0x56a0"})&&(memory==15473u)) memory = 16288u; // fix wrong (95% on Linux) memory reporting on Intel Arc A770 16GB
+			if(contains_any(name, {"A770", "0x56a0"})&&(memory== 7721u)) memory =  8128u; // fix wrong (95% on Linux) memory reporting on Intel Arc A770 8GB
+			if(contains_any(name, {"A750", "0x56a1"})&&(memory== 7721u)) memory =  8128u; // fix wrong (95% on Linux) memory reporting on Intel Arc A750 8GB
+			if(contains_any(name, {"A580", "0x56a2"})&&(memory== 7721u)) memory =  8128u; // fix wrong (95% on Linux) memory reporting on Intel Arc A580 8GB
+			if(contains_any(name, {"A380", "0x56a5"})&&(memory== 5783u)) memory =  6088u; // fix wrong (95% on Linux) memory reporting on Intel Arc A380 6GB
+		}
+		intel_gpu_above_4gb_patch = (intel==8.0f)&&(memory>4096); // enable memory allocations greater than 4GB for Intel GPUs with >4GB VRAM
 	}
 	inline Device_Info() {}; // default constructor
 };
@@ -161,11 +175,12 @@ class Device {
 		const string kernel_code = enable_device_capabilities()+"\n"+opencl_c_code;
 		cl_source.push_back({ kernel_code.c_str(), kernel_code.length() });
 		this->cl_program = cl::Program(info.cl_context, cl_source);
+		const string build_options = string("-cl-fast-relaxed-math")+(info.intel_gpu_above_4gb_patch ? " -cl-intel-greater-than-4GB-buffer-required" : "");
 #ifndef LOG
-		int error = cl_program.build({ info.cl_device }, "-cl-fast-relaxed-math -w"); // compile OpenCL C code, disable warnings
+		int error = cl_program.build({ info.cl_device }, (build_options+" -w").c_str()); // compile OpenCL C code, disable warnings
 		if(error) print_warning(cl_program.getBuildInfo<CL_PROGRAM_BUILD_LOG>(info.cl_device)); // print build log
 #else // LOG, generate logfile for OpenCL code compilation
-		int error = cl_program.build({ info.cl_device }, "-cl-fast-relaxed-math"); // compile OpenCL C code
+		int error = cl_program.build({ info.cl_device }, build_options.c_str()); // compile OpenCL C code
 		const string log = cl_program.getBuildInfo<CL_PROGRAM_BUILD_LOG>(info.cl_device);
 		write_file("bin/kernel.log", log); // save build log
 		if((uint)log.length()>2u) print_warning(log); // print build log
@@ -210,7 +225,7 @@ template<typename T> class Memory {
 			device.info.memory_used += (uint)(capacity()/1048576ull); // track device memory usage
 			if(device.info.memory_used>device.info.memory) print_error("Device \""+device.info.name+"\" does not have enough memory. Allocating another "+to_string((uint)(capacity()/1048576ull))+" MB would use a total of "+to_string(device.info.memory_used)+" MB / "+to_string(device.info.memory)+" MB.");
 			int error = 0;
-			device_buffer = cl::Buffer(device.get_cl_context(), CL_MEM_READ_WRITE, capacity(), nullptr, &error);
+			device_buffer = cl::Buffer(device.get_cl_context(), CL_MEM_READ_WRITE|((int)device.info.intel_gpu_above_4gb_patch<<23), capacity(), nullptr, &error); // for Intel GPUs, set flag CL_MEM_ALLOW_UNRESTRICTED_SIZE_INTEL = (1<<23)
 			if(error==-61) print_error("Memory size is too large at "+to_string((uint)(capacity()/1048576ull))+" MB. Device \""+device.info.name+"\" accepts a maximum buffer size of "+to_string(device.info.max_global_buffer)+" MB.");
 			else if(error) print_error("Device buffer allocation failed with error code "+to_string(error)+".");
 			device_buffer_exists = true;