diff --git a/README.md b/README.md
index 9e6f8b8a..21586ac8 100644
--- a/README.md
+++ b/README.md
@@ -50,6 +50,11 @@ The fastest and most memory efficient lattice Boltzmann CFD software, running on
   - fixed bug in make.sh where multi-GPU device IDs would not get forwarded to the executable
   - minor bug fixes in graphics engine (free cursor not centered during rotation, labels in VR mode)
   - fixed bug in LBM::voxelize_stl() size parameter standard initialization
+- v2.5 (11.04.2023)
+  - implemented light absorption in fluid for raytracing graphics (no performance impact)
+  - improved raytracing framerate when camera is inside fluid
+  - fixed skybox pole flickering artifacts
+  - fixed bug where moving objects during re-voxelization would leave an erroneous trail of solid grid cells behind
 
 </details>
 
@@ -57,12 +62,11 @@ The fastest and most memory efficient lattice Boltzmann CFD software, running on
 ## Compute Features
 
 - CFD model: lattice Boltzmann method (LBM)
-</details><details><summary>&nbsp; &nbsp; &nbsp; &nbsp;&#9900;&nbsp; streaming (part 2/2)</summary><p align="center"><i>f</i><sub>0</sub><sup>temp</sup>(<i>x</i>,<i>t</i>) = <i>f</i><sub>0</sub>(<i>x</i>, <i>t</i>)<br><i>f<sub>i</sub></i><sup>temp</sup>(<i>x</i>,<i>t</i>) = <i>f</i><sub>(<i>t</i>%2 ? <i>i</i> : (<i>i</i>%2 ? <i>i</i>+1 : <i>i</i>-1))</sub>(<i>i</i>%2 ? <i>x</i> : <i>x</i>-<i>e<sub>i</sub></i>, <i>t</i>) &nbsp; for &nbsp; <i>i</i> &isin; [1, <i>q</i>-1]</p></details>
-</details><details><summary>&nbsp; &nbsp; &nbsp; &nbsp;&#9900;&nbsp; collision</summary><p align="center"><i>&rho;</i>(<i>x</i>,<i>t</i>) = (&Sigma;<sub><i>i</i></sub> <i>f<sub>i</sub></i><sup>temp</sup>(<i>x</i>,<i>t</i>)) + 1<br><br><i>u</i>(<i>x</i>,<i>t</i>) = <sup>1</sup>&#8725;<sub><i>&rho;</i>(<i>x</i>,<i>t</i>)</sub> &Sigma;<sub><i>i</i></sub> <i>c<sub>i</sub></i> <i>f<sub>i</sub></i><sup>temp</sup>(<i>x</i>,<i>t</i>)<br><br><i>f<sub>i</sub></i><sup>eq-shifted</sup>(<i>x</i>,<i>t</i>) = <i>w<sub>i</sub></i> <i>&rho;</i> · (<sup>(<i>u</i><sub>°</sub><i>c<sub>i</sub></i>)<sup>2</sup></sup>&#8725;<sub>(2<i>c</i><sup>4</sup>)</sub> - <sup>(<i>u</i><sub>°</sub><i>u</i>)</sup>&#8725;<sub>(2c<sup>2</sup>)</sub> + <sup>(<i>u</i><sub>°</sub><i>c<sub>i</sub></i>)</sup>&#8725;<sub><i>c</i><sup>2</sup></sub>) + <i>w<sub>i</sub></i> (<i>&rho;</i>-1)<br><br><i>f<sub>i</sub></i><sup>temp</sup>(<i>x</i>, <i>t</i>+&Delta;<i>t</i>) = <i>f<sub>i</sub></i><sup>temp</sup>(<i>x</i>,<i>t</i>) + <i>&Omega;<sub>i</sub></i>(<i>f<sub>i</sub></i><sup>temp</sup>(<i>x</i>,<i>t</i>), <i>f<sub>i</sub></i><sup>eq-shifted</sup>(<i>x</i>,<i>t</i>), <i>&tau;</i>)</p></details>
-</details><details><summary>&nbsp; &nbsp; &nbsp; &nbsp;&#9900;&nbsp; streaming (part 1/2)</summary><p align="center"><i>f</i><sub>0</sub>(<i>x</i>, <i>t</i>+&Delta;<i>t</i>) = <i>f</i><sub>0</sub><sup>temp</sup>(<i>x</i>, <i>t</i>+&Delta;<i>t</i>)<br><i>f</i><sub>(<i>t</i>%2 ? (<i>i</i>%2 ? <i>i</i>+1 : <i>i</i>-1) : <i>i</i>)</sub>(<i>i</i>%2 ? <i>x</i>+<i>e<sub>i</sub></i> : <i>x</i>, <i>t</i>+&Delta;<i>t</i>) = <i>f<sub>i</sub></i><sup>temp</sup>(<i>x</i>, <i>t</i>+&Delta;<i>t</i>) &nbsp; for &nbsp; <i>i</i> &isin; [1, <i>q</i>-1]</p></details>
+  <details><summary>&#9900; &nbsp;streaming (part 2/2)</summary><p align="center"><i>f</i><sub>0</sub><sup>temp</sup>(<i>x</i>,<i>t</i>) = <i>f</i><sub>0</sub>(<i>x</i>, <i>t</i>)<br><i>f<sub>i</sub></i><sup>temp</sup>(<i>x</i>,<i>t</i>) = <i>f</i><sub>(<i>t</i>%2 ? <i>i</i> : (<i>i</i>%2 ? <i>i</i>+1 : <i>i</i>-1))</sub>(<i>i</i>%2 ? <i>x</i> : <i>x</i>-<i>e<sub>i</sub></i>, <i>t</i>) &nbsp; for &nbsp; <i>i</i> &isin; [1, <i>q</i>-1]</p></details>
+  <details><summary>&#9900; &nbsp;collision</summary><p align="center"><i>&rho;</i>(<i>x</i>,<i>t</i>) = (&Sigma;<sub><i>i</i></sub> <i>f<sub>i</sub></i><sup>temp</sup>(<i>x</i>,<i>t</i>)) + 1<br><br><i>u</i>(<i>x</i>,<i>t</i>) = <sup>1</sup>&#8725;<sub><i>&rho;</i>(<i>x</i>,<i>t</i>)</sub> &Sigma;<sub><i>i</i></sub> <i>c<sub>i</sub></i> <i>f<sub>i</sub></i><sup>temp</sup>(<i>x</i>,<i>t</i>)<br><br><i>f<sub>i</sub></i><sup>eq-shifted</sup>(<i>x</i>,<i>t</i>) = <i>w<sub>i</sub></i> <i>&rho;</i> · (<sup>(<i>u</i><sub>°</sub><i>c<sub>i</sub></i>)<sup>2</sup></sup>&#8725;<sub>(2<i>c</i><sup>4</sup>)</sub> - <sup>(<i>u</i><sub>°</sub><i>u</i>)</sup>&#8725;<sub>(2c<sup>2</sup>)</sub> + <sup>(<i>u</i><sub>°</sub><i>c<sub>i</sub></i>)</sup>&#8725;<sub><i>c</i><sup>2</sup></sub>) + <i>w<sub>i</sub></i> (<i>&rho;</i>-1)<br><br><i>f<sub>i</sub></i><sup>temp</sup>(<i>x</i>, <i>t</i>+&Delta;<i>t</i>) = <i>f<sub>i</sub></i><sup>temp</sup>(<i>x</i>,<i>t</i>) + <i>&Omega;<sub>i</sub></i>(<i>f<sub>i</sub></i><sup>temp</sup>(<i>x</i>,<i>t</i>), <i>f<sub>i</sub></i><sup>eq-shifted</sup>(<i>x</i>,<i>t</i>), <i>&tau;</i>)</p></details>
+  <details><summary>&#9900; &nbsp;streaming (part 1/2)</summary><p align="center"><i>f</i><sub>0</sub>(<i>x</i>, <i>t</i>+&Delta;<i>t</i>) = <i>f</i><sub>0</sub><sup>temp</sup>(<i>x</i>, <i>t</i>+&Delta;<i>t</i>)<br><i>f</i><sub>(<i>t</i>%2 ? (<i>i</i>%2 ? <i>i</i>+1 : <i>i</i>-1) : <i>i</i>)</sub>(<i>i</i>%2 ? <i>x</i>+<i>e<sub>i</sub></i> : <i>x</i>, <i>t</i>+&Delta;<i>t</i>) = <i>f<sub>i</sub></i><sup>temp</sup>(<i>x</i>, <i>t</i>+&Delta;<i>t</i>) &nbsp; for &nbsp; <i>i</i> &isin; [1, <i>q</i>-1]</p></details>
 
 <!-- markdown equations don't render properly in mobile browser
-
   - streaming (part 2/2):
 $$j=0\\ \textrm{for}\\ i=0$$
 $$j=t\\%2\\ ?\\ i\\ :\\ (i\\%2\\ ?\\ i+1\\ :\\ i-1)\\ \textrm{for}\\ i\in[1,q-1]$$
@@ -76,104 +80,126 @@ $$f_i^\textrm{temp}(\vec{x},\\ t+\Delta t)=f_i^\textrm{temp}(\vec{x},t)+\Omega_i
 $$j=0\\ \textrm{for}\\ i=0$$
 $$j=t\\%2\\ ?\\ (i\\%2\\ ?\\ i+1\\ :\\ i-1)\\ :\\ i\\ \textrm{for}\\ i\in[1,q-1]$$
 $$f_j(i\\%2\\ ?\\ \vec{x}+\vec{e}_i\\ :\\ \vec{x},\\ t+\Delta t)=f_i^\textrm{temp}(\vec{x},\\ t+\Delta t)$$
-
  -->
 
-- peak performance on most GPUs (datacenter/gaming/professional/laptop), validated with roofline model
+- peak performance on GPUs (datacenter/gaming/professional/laptop), validated with roofline model
+- optimized to minimize memory demand:
+  - traditional LBM (D3Q19) with FP64 requires ~344 Bytes/cell
+    ```
+    🟧🟧🟧🟧🟧🟧🟧🟧🟦🟦🟦🟦🟦🟦🟦🟦🟦🟦🟦🟦🟦🟦🟦🟦🟦🟦🟦🟦🟦🟦🟦🟦
+    🟨🟨🟨🟨🟨🟨🟨🟨🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩
+    🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩
+    🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩
+    🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩
+    🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩
+    🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥
+    🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥
+    🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥
+    🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥
+    🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥🟥
+
+    (density 🟧, velocity 🟦, flags 🟨, 2 copies of density distribution functions (DDFs) 🟩/🟥; each square = 1 Byte)
+    ```
+    - allows for 3 million cells per 1 GB VRAM
+  - FluidX3D (D3Q19) requires only 55 Bytes/cell with [Esoteric-Pull](https://doi.org/10.3390/computation10060092)+[FP16](https://www.researchgate.net/publication/362275548_Accuracy_and_performance_of_the_lattice_Boltzmann_method_with_64-bit_32-bit_and_customized_16-bit_number_formats)
+    ```
+    🟧🟧🟧🟧🟦🟦🟦🟦🟦🟦🟦🟦🟦🟦🟦🟦🟨🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩
+    🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩
+
+    (density 🟧, velocity 🟦, flags 🟨, DDFs 🟩; each square = 1 Byte)
+    ```
+    - allows for 19 million cells per 1 GB VRAM
+    - in-place streaming with [Esoteric-Pull](https://doi.org/10.3390/computation10060092): eliminates redundant copy `B` of density distribution functions (DDFs) in memory; almost cuts memory demand in half and slightly increases performance due to implicit bounce-back boundaries; offers optimal memory access patterns for single-cell in-place streaming
+    - [decoupled arithmetic precision (FP32) and memory precision (FP32 or FP16S or FP16C)](https://www.researchgate.net/publication/362275548_Accuracy_and_performance_of_the_lattice_Boltzmann_method_with_64-bit_32-bit_and_customized_16-bit_number_formats): all arithmetic is done in FP32 for compatibility on all hardware, but DDFs in memory can be compressed to FP16S or FP16C: almost cuts memory demand in half again and almost doubles performance, without impacting overall accuracy for most setups
 - multi-GPU support on a single node (PC/laptop/server) via domain decomposition
   - allows pooling VRAM from multiple GPUs for much larger grid resolution
   - each domain (GPU) can hold up to 4.29 billion (2³², 1624³) lattice points (225 GB memory)
   - GPUs don't have to be identical (not even from the same vendor), but similar VRAM capacity/bandwidth is recommended
-
-<details><summary>&nbsp; &nbsp; &nbsp; &nbsp;&#9900;&nbsp; domain communication architecture (simplified)</summary>
-
-```diff
-++   .-----------------------------------------------------------------.   ++
-++   |                              GPU 0                              |   ++
-++   |                          LBM Domain 0                           |   ++
-++   '-----------------------------------------------------------------'   ++
-++              |                 selective                /|\             ++
-++             \|/               in-VRAM copy               |              ++
-++        .-------------------------------------------------------.        ++
-++        |               GPU 0 - Transfer Buffer 0               |        ++
-++        '-------------------------------------------------------'        ++
-!!                            |     PCIe     /|\                           !!
-!!                           \|/    copy      |                            !!
-@@        .-------------------------.   .-------------------------.        @@
-@@        | CPU - Transfer Buffer 0 |   | CPU - Transfer Buffer 1 |        @@
-@@        '-------------------------'\ /'-------------------------'        @@
-@@                           pointer  X   swap                             @@
-@@        .-------------------------./ \.-------------------------.        @@
-@@        | CPU - Transfer Buffer 1 |   | CPU - Transfer Buffer 0 |        @@
-@@        '-------------------------'   '-------------------------'        @@
-!!                           /|\    PCIe      |                            !!
-!!                            |     copy     \|/                           !!
-++        .-------------------------------------------------------.        ++
-++        |               GPU 1 - Transfer Buffer 1               |        ++
-++        '-------------------------------------------------------'        ++
-++             /|\                selective                 |              ++
-++              |                in-VRAM copy              \|/             ++
-++   .-----------------------------------------------------------------.   ++
-++   |                              GPU 1                              |   ++
-++   |                          LBM Domain 1                           |   ++
-++   '-----------------------------------------------------------------'   ++
-##                                    |                                    ##
-##                      domain synchronization barrier                     ##
-##                                    |                                    ##
-||   -------------------------------------------------------------> time   ||
-```
-
-</details><details><summary>&nbsp; &nbsp; &nbsp; &nbsp;&#9900;&nbsp; domain communication architecture (detailed)</summary>
-
-```diff
-++   .-----------------------------------------------------------------.   ++
-++   |                              GPU 0                              |   ++
-++   |                          LBM Domain 0                           |   ++
-++   '-----------------------------------------------------------------'   ++
-++     |  selective in- /|\  |  selective in- /|\  |  selective in- /|\    ++
-++    \|/ VRAM copy (X)  |  \|/ VRAM copy (Y)  |  \|/ VRAM copy (Z)  |     ++
-++   .---------------------.---------------------.---------------------.   ++
-++   |    GPU 0 - TB 0X+   |    GPU 0 - TB 0Y+   |    GPU 0 - TB 0Z+   |   ++
-++   |    GPU 0 - TB 0X-   |    GPU 0 - TB 0Y-   |    GPU 0 - TB 0Z-   |   ++
-++   '---------------------'---------------------'---------------------'   ++
-!!          | PCIe /|\            | PCIe /|\            | PCIe /|\         !!
-!!         \|/ copy |            \|/ copy |            \|/ copy |          !!
-@@   .---------. .---------.---------. .---------.---------. .---------.   @@
-@@   | CPU 0X+ | | CPU 1X- | CPU 0Y+ | | CPU 3Y- | CPU 0Z+ | | CPU 5Z- |   @@
-@@   | CPU 0X- | | CPU 2X+ | CPU 0Y- | | CPU 4Y+ | CPU 0Z- | | CPU 6Z+ |   @@
-@@   '---------\ /---------'---------\ /---------'---------\ /---------'   @@
-@@      pointer X swap (X)    pointer X swap (Y)    pointer X swap (Z)     @@
-@@   .---------/ \---------.---------/ \---------.---------/ \---------.   @@
-@@   | CPU 1X- | | CPU 0X+ | CPU 3Y- | | CPU 0Y+ | CPU 5Z- | | CPU 0Z+ |   @@
-@@   | CPU 2X+ | | CPU 0X- | CPU 4Y+ | | CPU 0Y- | CPU 6Z+ | | CPU 0Z- |   @@
-@@   '---------' '---------'---------' '---------'---------' '---------'   @@
-!!         /|\ PCIe |            /|\ PCIe |            /|\ PCIe |          !!
-!!          | copy \|/            | copy \|/            | copy \|/         !!
-++   .--------------------. .-------------------. .--------------------.   ++
-++   |   GPU 1 - TB 1X-   | |   GPU 3 - TB 3Y-  | |   GPU 5 - TB 5Z-   |   ++
-++   :====================: :===================: :====================:   ++
-++   |   GPU 2 - TB 2X+   | |   GPU 4 - TB 4Y+  | |   GPU 6 - TB 6Z+   |   ++
-++   '--------------------' '-------------------' '--------------------'   ++
-++    /|\ selective in-  |  /|\ selective in-  |  /|\ selective in-  |     ++
-++     |  VRAM copy (X) \|/  |  VRAM copy (Y) \|/  |  VRAM copy (Z) \|/    ++
-++   .--------------------. .-------------------. .--------------------.   ++
-++   |        GPU 1       | |       GPU 3       | |        GPU 5       |   ++
-++   |    LBM Domain 1    | |   LBM Domain 3    | |    LBM Domain 5    |   ++
-++   :====================: :===================: :====================:   ++
-++   |        GPU 2       | |       GPU 4       | |        GPU 6       |   ++
-++   |    LBM Domain 2    | |   LBM Domain 4    | |    LBM Domain 6    |   ++
-++   '--------------------' '-------------------' '--------------------'   ++
-##              |                     |                     |              ##
-##              |      domain synchronization barriers      |              ##
-##              |                     |                     |              ##
-||   -------------------------------------------------------------> time   ||
-```
-
-</details>
-
-- optimized to minimize memory demand to 55 Bytes/node (~⅙|~⅓ of conventional FP64|FP32 LBM solvers)
-  - in-place streaming with [Esoteric-Pull](https://doi.org/10.3390/computation10060092): almost cuts memory demand in half and slightly increases performance due to implicit bounce-back boundaries; offers optimal memory access patterns for single-cell in-place streaming
-  - [decoupled arithmetic precision (FP32) and memory precision (FP32 or FP16S or FP16C)](https://www.researchgate.net/publication/362275548_Accuracy_and_performance_of_the_lattice_Boltzmann_method_with_64-bit_32-bit_and_customized_16-bit_number_formats): all arithmetic is done in FP32 for compatibility on all hardware, but LBM density distribution functions in memory can be compressed to FP16S or FP16C: almost cuts memory demand in half again and almost doubles performance, without impacting overall accuracy for most setups
+  <details><summary>&#9900; &nbsp;domain communication architecture (simplified)</summary>
+
+  ```diff
+  ++   .-----------------------------------------------------------------.   ++
+  ++   |                              GPU 0                              |   ++
+  ++   |                          LBM Domain 0                           |   ++
+  ++   '-----------------------------------------------------------------'   ++
+  ++              |                 selective                /|\             ++
+  ++             \|/               in-VRAM copy               |              ++
+  ++        .-------------------------------------------------------.        ++
+  ++        |               GPU 0 - Transfer Buffer 0               |        ++
+  ++        '-------------------------------------------------------'        ++
+  !!                            |     PCIe     /|\                           !!
+  !!                           \|/    copy      |                            !!
+  @@        .-------------------------.   .-------------------------.        @@
+  @@        | CPU - Transfer Buffer 0 |   | CPU - Transfer Buffer 1 |        @@
+  @@        '-------------------------'\ /'-------------------------'        @@
+  @@                           pointer  X   swap                             @@
+  @@        .-------------------------./ \.-------------------------.        @@
+  @@        | CPU - Transfer Buffer 1 |   | CPU - Transfer Buffer 0 |        @@
+  @@        '-------------------------'   '-------------------------'        @@
+  !!                           /|\    PCIe      |                            !!
+  !!                            |     copy     \|/                           !!
+  ++        .-------------------------------------------------------.        ++
+  ++        |               GPU 1 - Transfer Buffer 1               |        ++
+  ++        '-------------------------------------------------------'        ++
+  ++             /|\                selective                 |              ++
+  ++              |                in-VRAM copy              \|/             ++
+  ++   .-----------------------------------------------------------------.   ++
+  ++   |                              GPU 1                              |   ++
+  ++   |                          LBM Domain 1                           |   ++
+  ++   '-----------------------------------------------------------------'   ++
+  ##                                    |                                    ##
+  ##                      domain synchronization barrier                     ##
+  ##                                    |                                    ##
+  ||   -------------------------------------------------------------> time   ||
+  ```
+
+  </details><details><summary>&#9900; &nbsp;domain communication architecture (detailed)</summary>
+
+  ```diff
+  ++   .-----------------------------------------------------------------.   ++
+  ++   |                              GPU 0                              |   ++
+  ++   |                          LBM Domain 0                           |   ++
+  ++   '-----------------------------------------------------------------'   ++
+  ++     |  selective in- /|\  |  selective in- /|\  |  selective in- /|\    ++
+  ++    \|/ VRAM copy (X)  |  \|/ VRAM copy (Y)  |  \|/ VRAM copy (Z)  |     ++
+  ++   .---------------------.---------------------.---------------------.   ++
+  ++   |    GPU 0 - TB 0X+   |    GPU 0 - TB 0Y+   |    GPU 0 - TB 0Z+   |   ++
+  ++   |    GPU 0 - TB 0X-   |    GPU 0 - TB 0Y-   |    GPU 0 - TB 0Z-   |   ++
+  ++   '---------------------'---------------------'---------------------'   ++
+  !!          | PCIe /|\            | PCIe /|\            | PCIe /|\         !!
+  !!         \|/ copy |            \|/ copy |            \|/ copy |          !!
+  @@   .---------. .---------.---------. .---------.---------. .---------.   @@
+  @@   | CPU 0X+ | | CPU 1X- | CPU 0Y+ | | CPU 3Y- | CPU 0Z+ | | CPU 5Z- |   @@
+  @@   | CPU 0X- | | CPU 2X+ | CPU 0Y- | | CPU 4Y+ | CPU 0Z- | | CPU 6Z+ |   @@
+  @@   '---------\ /---------'---------\ /---------'---------\ /---------'   @@
+  @@      pointer X swap (X)    pointer X swap (Y)    pointer X swap (Z)     @@
+  @@   .---------/ \---------.---------/ \---------.---------/ \---------.   @@
+  @@   | CPU 1X- | | CPU 0X+ | CPU 3Y- | | CPU 0Y+ | CPU 5Z- | | CPU 0Z+ |   @@
+  @@   | CPU 2X+ | | CPU 0X- | CPU 4Y+ | | CPU 0Y- | CPU 6Z+ | | CPU 0Z- |   @@
+  @@   '---------' '---------'---------' '---------'---------' '---------'   @@
+  !!         /|\ PCIe |            /|\ PCIe |            /|\ PCIe |          !!
+  !!          | copy \|/            | copy \|/            | copy \|/         !!
+  ++   .--------------------..---------------------..--------------------.   ++
+  ++   |   GPU 1 - TB 1X-   ||    GPU 3 - TB 3Y-   ||   GPU 5 - TB 5Z-   |   ++
+  ++   :====================::=====================::====================:   ++
+  ++   |   GPU 2 - TB 2X+   ||    GPU 4 - TB 4Y+   ||   GPU 6 - TB 6Z+   |   ++
+  ++   '--------------------''---------------------''--------------------'   ++
+  ++    /|\ selective in-  |  /|\ selective in-  |  /|\ selective in-  |     ++
+  ++     |  VRAM copy (X) \|/  |  VRAM copy (Y) \|/  |  VRAM copy (Z) \|/    ++
+  ++   .--------------------..---------------------..--------------------.   ++
+  ++   |        GPU 1       ||        GPU 3        ||        GPU 5       |   ++
+  ++   |    LBM Domain 1    ||    LBM Domain 3     ||    LBM Domain 5    |   ++
+  ++   :====================::=====================::====================:   ++
+  ++   |        GPU 2       ||        GPU 4        ||        GPU 6       |   ++
+  ++   |    LBM Domain 2    ||    LBM Domain 4     ||    LBM Domain 6    |   ++
+  ++   '--------------------''---------------------''--------------------'   ++
+  ##              |                     |                     |              ##
+  ##              |      domain synchronization barriers      |              ##
+  ##              |                     |                     |              ##
+  ||   -------------------------------------------------------------> time   ||
+  ```
+
+  </details>
 - [DDF-shifting](https://www.researchgate.net/publication/362275548_Accuracy_and_performance_of_the_lattice_Boltzmann_method_with_64-bit_32-bit_and_customized_16-bit_number_formats) and other algebraic optimization to minimize round-off error
 - velocity sets: D2Q9, D3Q15, D3Q19 (default), D3Q27
 - collision operators: single-relaxation-time (SRT/BGK) (default), two-relaxation-time (TRT)
@@ -221,7 +247,7 @@ $$f_j(i\\%2\\ ?\\ \vec{x}+\vec{e}_i\\ :\\ \vec{x},\\ t+\Delta t)=f_i^\textrm{tem
 - rendering is fully parallelized for multi-GPU via seamless domain decomposition rasterization
 - with interactive graphics mode disabled, image resolution can be as large as VRAM allows for (132 Megapixel (16K) and above)
 - (interacitive) visualization modes:
-  - flags (and force vectors on solid boundary nodes if the extension is used)
+  - flags (and force vectors on solid boundary cells if the extension is used)
   - velocity field
   - streamlines
   - velocity-colored Q-criterion isosurface
@@ -243,12 +269,13 @@ $$f_j(i\\%2\\ ?\\ \vec{x}+\vec{e}_i\\ :\\ \vec{x},\\ t+\Delta t)=f_i^\textrm{tem
 4. Keyboard/mouse controls with `INTERACTIVE_GRAPHICS`/`INTERACTIVE_GRAPHICS_ASCII` enabled:
    - <kbd>P</kbd>: start/pause the simulation
    - <kbd>H</kbd>: show/hide help
-   - <kbd>1</kbd>: flags (and force vectors on solid boundary nodes if the extension is used)
+   - <kbd>1</kbd>: flags (and force vectors on solid boundary cells if the extension is used)
    - <kbd>2</kbd>: velocity field
    - <kbd>3</kbd>: streamlines
    - <kbd>4</kbd>: vorticity / velocity-colored Q-criterion isosurface
    - <kbd>5</kbd>: rasterized free surface
    - <kbd>6</kbd>: raytraced free surface
+   - <kbd>7</kbd>: particles
    - <kbd>Mouse</kbd> or <kbd>I</kbd>/<kbd>J</kbd>/<kbd>K</kbd>/<kbd>L</kbd>: rotate camera
    - <kbd>Scrollwheel</kbd> or <kbd>+</kbd>/<kbd>-</kbd>: zoom (centered camera mode) or camera movement speed (free camera mode)
    - <kbd>Mouseclick</kbd> or <kbd>U</kbd>: toggle rotation with <kbd>Mouse</kbd> and angle snap rotation with <kbd>I</kbd>/<kbd>J</kbd>/<kbd>K</kbd>/<kbd>L</kbd>
@@ -323,6 +350,7 @@ If your GPU is not on the list yet, you can report your benchmarks [here](https:
 | Nvidia GeForce RTX 3090 Ti    |              40.00 |          24 |         1008 |             5717 (87%) |             10956 (84%) |             10400 (79%) |
 | Nvidia GeForce RTX 3090       |              39.05 |          24 |          936 |             5418 (89%) |             10732 (88%) |             10215 (84%) |
 | Nvidia GeForce RTX 3080 Ti    |              37.17 |          12 |          912 |             5202 (87%) |              9832 (87%) |              9347 (79%) |
+| Nvidia RTX A6000              |              40.00 |          48 |          768 |             4421 (88%) |              8814 (88%) |              8533 (86%) |
 | Nvidia GeForce RTX 3080       |              29.77 |          10 |          760 |             4230 (85%) |              8118 (82%) |              7714 (78%) |
 | Nvidia GeForce RTX 3070       |              20.31 |           8 |          448 |             2578 (88%) |              5096 (88%) |              5060 (87%) |
 | Nvidia GeForce RTX 3060 Ti    |              16.49 |           8 |          448 |             2644 (90%) |              5129 (88%) |              4718 (81%) |
@@ -394,6 +422,7 @@ Multi-GPU benchmarks are done at the largest possible grid resolution with a cub
 | 1x AMD Radeon VII             |              13.83 |          16 |         1024 |             4898 (73%) |              7778 (58%) |              5256 (40%) |
 | 2x AMD Radeon VII             |              27.66 |          32 |         2048 |            8113 (1.7x) |            15591 (2.0x) |            10352 (2.0x) |
 | 4x AMD Radeon VII             |              55.32 |          64 |         4096 |           12911 (2.6x) |            24273 (3.1x) |            17080 (3.2x) |
+| 8x AMD Radeon VII             |             110.64 |         128 |         8192 |           21946 (4.5x) |            30826 (4.0x) |            24572 (4.7x) |
 |                               |                    |             |              |                        |                         |                         |
 | 1x Nvidia A100 SXM4 40GB      |              19.49 |          40 |         1555 |             8522 (84%) |             16013 (79%) |             11251 (56%) |
 | 2x Nvidia A100 SXM4 40GB      |              38.98 |          80 |         3110 |           13629 (1.6x) |            24620 (1.5x) |            18850 (1.7x) |
@@ -401,12 +430,20 @@ Multi-GPU benchmarks are done at the largest possible grid resolution with a cub
 |                               |                    |             |              |                        |                         |                         |
 | 1x Nvidia Tesla K40m          |               4.29 |          12 |          288 |             1131 (60%) |              1868 (50%) |               912 (24%) |
 | 2x Nvidia Tesla K40m          |               8.58 |          24 |          577 |            1971 (1.7x) |             3300 (1.8x) |             1801 (2.0x) |
+| 3x Tesla K40m + 1x Titan Xp   |              17.16 |          48 |         1154 |            3117 (2.8x) |             5174 (2.8x) |             3127 (3.4x) |
+|                               |                    |             |              |                        |                         |                         |
+| 1x Nvidia RTX A6000           |              40.00 |          48 |          768 |             4421 (88%) |              8814 (88%) |              8533 (86%) |
+| 2x Nvidia RTX A6000           |              80.00 |          96 |         1536 |            8041 (1.8x) |            15026 (1.7x) |            14795 (1.7x) |
+| 4x Nvidia RTX A6000           |             160.00 |         192 |         3072 |           14314 (3.2x) |            27915 (3.2x) |            27227 (3.2x) |
+| 8x Nvidia RTX A6000           |             320.00 |         384 |         6144 |           19311 (4.4x) |            40063 (4.5x) |            39004 (4.6x) |
 |                               |                    |             |              |                        |                         |                         |
 | 1x Nvidia Quadro RTX 8000 Pa. |              14.93 |          48 |          624 |             2591 (64%) |              5408 (67%) |              5607 (69%) |
 | 2x Nvidia Quadro RTX 8000 Pa. |              29.86 |          96 |         1248 |            4767 (1.8x) |             9607 (1.8x) |            10214 (1.8x) |
 |                               |                    |             |              |                        |                         |                         |
 | 1x Nvidia GeForce RTX 2080 Ti |              13.45 |          11 |          616 |             3194 (79%) |              6700 (84%) |              6853 (86%) |
 | 2x Nvidia GeForce RTX 2080 Ti |              26.90 |          22 |         1232 |            5085 (1.6x) |            10770 (1.6x) |            10922 (1.6x) |
+| 4x Nvidia GeForce RTX 2080 Ti |              53.80 |          44 |         2464 |            9117 (2.9x) |            18415 (2.7x) |            18598 (2.7x) |
+| 7x RTX 2080 Ti + 1x A100 40GB |             107.60 |          88 |         4928 |           16146 (5.1x) |            33732 (5.0x) |            33857 (4.9x) |
 
 
 
@@ -449,7 +486,7 @@ Multi-GPU benchmarks are done at the largest possible grid resolution with a cub
 
 - <details><summary>Why is the domain size limited to 2³² grid points?</summary><br>The 32-bit unsigned integer grid index will overflow above this number. Using 64-bit index calculation would slow the simulation down by ~20%, as 64-bit uint is calculated on special function units and not the regular GPU cores. 2³² grid points with FP32/FP16 mixed-precision is equivalent to 225GB memory and single GPUs currently are only at 128GB, so it should be fine for a while to come. For higher resolutions above the single-domain limit, use multiple domains (typically 1 per GPU, but multiple domains on the same GPU also work).<br><br></details>
 
-- <details><summary>Comparted to the benchmark numbers stated <a href="https://www.researchgate.net/publication/362275548_Accuracy_and_performance_of_the_lattice_Boltzmann_method_with_64-bit_32-bit_and_customized_16-bit_number_formats">here</a>, efficiency seems much lower but performance is slightly better for most devices. How can this be?</summary><br>In that paper, the One-Step-Pull swap algorithm is implemented, using only misaligned reads and coalesced writes. On almost all GPUs, the performance penalty for misaligned writes is much larger than for misaligned reads, and sometimes there is almost no penalty for misaligned reads at all. Because of this, One-Step-Pull runs at peak bandwidth and thus peak efficiency.<br>Here, a different swap algorithm termed <a href="https://doi.org/10.3390/computation10060092">Esoteric-Pull</a> is used, a type of in-place streaming. This makes the LBM require much less memory (93 vs. 169 (FP32/FP32) or 55 vs. 93 (FP32/FP16) Bytes/node for D3Q19), and also less memory bandwidth (153 vs. 171 (FP32/FP32) or 77 vs. 95 (FP32/FP16) Bytes/node per time step for D3Q19) due to so-called implicit bounce-back boundaries. However memory access now is half coalesced and half misaligned for both reads and writes, so memory access efficiency is lower. For overall performance, these two effects approximately cancel out. The benefit of Esoteric-Pull - being able to simulate domains twice as large with the same amount of memory - clearly outweights the cost of slightly lower memory access efficiency, especially since performance is not reduced overall.<br><br></details>
+- <details><summary>Compared to the benchmark numbers stated <a href="https://www.researchgate.net/publication/362275548_Accuracy_and_performance_of_the_lattice_Boltzmann_method_with_64-bit_32-bit_and_customized_16-bit_number_formats">here</a>, efficiency seems much lower but performance is slightly better for most devices. How can this be?</summary><br>In that paper, the One-Step-Pull swap algorithm is implemented, using only misaligned reads and coalesced writes. On almost all GPUs, the performance penalty for misaligned writes is much larger than for misaligned reads, and sometimes there is almost no penalty for misaligned reads at all. Because of this, One-Step-Pull runs at peak bandwidth and thus peak efficiency.<br>Here, a different swap algorithm termed <a href="https://doi.org/10.3390/computation10060092">Esoteric-Pull</a> is used, a type of in-place streaming. This makes the LBM require much less memory (93 vs. 169 (FP32/FP32) or 55 vs. 93 (FP32/FP16) Bytes/cell for D3Q19), and also less memory bandwidth (153 vs. 171 (FP32/FP32) or 77 vs. 95 (FP32/FP16) Bytes/cell per time step for D3Q19) due to so-called implicit bounce-back boundaries. However memory access now is half coalesced and half misaligned for both reads and writes, so memory access efficiency is lower. For overall performance, these two effects approximately cancel out. The benefit of Esoteric-Pull - being able to simulate domains twice as large with the same amount of memory - clearly outweighs the cost of slightly lower memory access efficiency, especially since performance is not reduced overall.<br><br></details>
 
 - <details><summary>Why don't you use CUDA? Wouldn't that be more efficient?</summary><br>No, that is a wrong myth. OpenCL is exactly as efficient as CUDA on Nvidia GPUs if optimized properly. <a href="https://www.researchgate.net/publication/362275548_Accuracy_and_performance_of_the_lattice_Boltzmann_method_with_64-bit_32-bit_and_customized_16-bit_number_formats">Here</a> I did roofline model and analyzed OpenCL performance on various hardware. OpenCL efficiency on modern Nvidia GPUs can be 100% with the right memory access pattern, so CUDA can't possibly be any more efficient. Without any performance advantage, there is no reason to use proprietary CUDA over OpenCL, since OpenCL is compatible with a lot more hardware.<br><br></details>
 
diff --git a/src/defines.hpp b/src/defines.hpp
index 3548a5dc..ded587f7 100644
--- a/src/defines.hpp
+++ b/src/defines.hpp
@@ -36,6 +36,8 @@
 #define GRAPHICS_BOUNDARY_FORCE_SCALE 100.0f // scaling factor for visualization of forces on solid boundaries if VOLUME_FORCE is enabled and lbm.calculate_force_on_boundaries(); is called (default: 100.0f)
 #define GRAPHICS_STREAMLINE_SPARSE 4 // set how many streamlines there are every x lattice points
 #define GRAPHICS_STREAMLINE_LENGTH 128 // set maximum length of streamlines
+#define GRAPHICS_RAYTRACING_TRANSMITTANCE 0.25f // transmitted light fraction in raytracing graphics ("0.25f" = 1/4 of light is transmitted and 3/4 is absorbed along longest box side length, "1.0f" = no absorption)
+#define GRAPHICS_RAYTRACING_COLOR 0x005F7F // absorption color of fluid in raytracing graphics
 
 
 
diff --git a/src/info.cpp b/src/info.cpp
index 823d1a30..c02ac129 100644
--- a/src/info.cpp
+++ b/src/info.cpp
@@ -67,7 +67,7 @@ void Info::print_logo() const {
 	print("|                                  ");                print("\\  \\ /  /", c);                 print("                                  |\n");
 	print("|                                   ");                print("\\  '  /", c);                  print("                                   |\n");
 	print("|                                    ");                print("\\   /", c);                  print("                                    |\n");
-	print("|                                     ");                print("\\ /", c);                  print("                FluidX3D Version 2.4 |\n");
+	print("|                                     ");                print("\\ /", c);                  print("                FluidX3D Version 2.5 |\n");
 	print("|                                      ");                 print("'", c);                  print("         Copyright (c) Moritz Lehmann |\n");
 }
 void Info::print_initialize() {
diff --git a/src/kernel.cpp b/src/kernel.cpp
index 381a91b0..5e4e0cb4 100644
--- a/src/kernel.cpp
+++ b/src/kernel.cpp
@@ -599,7 +599,7 @@ string opencl_c_container() { return R( // ########################## begin of O
 	camray.direction = p1-p0;
 	return camray;
 }
-)+R(uint skybox_bottom(const ray r, const uint skybox_color) {
+)+R(uint skybox_bottom(const ray r, const int c1, const int c2, const uint skybox_color) {
 	const float3 p0=(float3)(0.0f, 0.0f, -0.5f*(float)def_Nz), p1=(float3)(1.0f, 0.0f, -0.5f*(float)def_Nz), p2=(float3)(0.0f, 1.0f, -0.5f*(float)def_Nz);
 	const float distance = intersect_plane(r, p0, p1, p2);
 	if(distance>0.0f) { // ray intersects with bottom
@@ -609,7 +609,8 @@ string opencl_c_container() { return R( // ########################## begin of O
 		int a = abs((int)floor(scale*intersection.x));
 		int b = abs((int)floor(scale*intersection.y));
 		const float r = scale*sqrt(sq(intersection.x)+sq(intersection.y));
-		return color_mix((a%2==b%2)*0xFFFFFF, skybox_color, clamp(2.0f/r, 0.0f, 1.0f));
+		const int w = (a%2==b%2);
+		return color_mix(w*c1+(1-w)*c2, color_mix(c1, c2, 0.5f), clamp(10.0f/r, 0.0f, 1.0f));
 	} else {
 		return skybox_color;
 	}
@@ -626,25 +627,28 @@ string opencl_c_container() { return R( // ########################## begin of O
 )+R(uint skybox_color_sunset(const float x, const float y) {
 	return color_mix(255<<16|175<<8|55, y<0.5f ? 55<<16|111<<8|255 : 0, 2.0f*(0.5f-fabs(y-0.5f)));
 }
-)+R(uint skybox_color_grid(const float x, const float y) {
-	int a = (int)(36.0f*x);
-	int b = (int)(18.0f*y);
-	return 0xFFFFFF*(a%2==b%2);
+)+R(uint skybox_color_grid(const float x, const float y, const int c1, const int c2) {
+	int a = (int)(72.0f*x);
+	int b = (int)(36.0f*y);
+	const int w = (a%2==b%2);
+	return w*c1+(1-w)*c2;
 }
 )+R(uint skybox_color(const ray r, const global int* skybox) {
-	//const float x = fma(atan2(r.direction.x, r.direction.y),  0.5f/3.1415927f, 0.5f);
-	//const float y = fma(asin (r.direction.z               ), -1.0f/3.1415927f, 0.5f);
-	//return color_mix(skybox_color_hsv(x, y), skybox_color_grid(x, y), 0.95f-0.33f*(2.0f*(0.5f-fabs(y-0.5f))));
-	//return skybox_color_sunset(x, y);
-	const float fu = (float)def_skybox_width *fma(atan2(r.direction.x, r.direction.y),  0.5f/3.1415927f, 0.5f);
-	const float fv = (float)def_skybox_height*fma(asin (r.direction.z               ), -1.0f/3.1415927f, 0.5f);
+	const float3 direction = normalize(r.direction); // to avoid artifacts from asin(direction.z)
+	//const float x = fma(atan2(direction.x, direction.y),  0.5f/3.1415927f, 0.5f);
+	//const float y = fma(asin (direction.z             ), -1.0f/3.1415927f, 0.5f);
+	//return skybox_color_bw(x, y);
+	//return color_mix(skybox_color_hsv(x, y), skybox_color_grid(x, y, 0xFFFFFF, 0x000000), 0.95f-0.33f*(2.0f*(0.5f-fabs(y-0.5f))));
+	//return skybox_bottom(r, 0xFFFFFF, 0xF0F0F0, skybox_color_grid(x, y, 0xFFFFFF, 0xF0F0F0));
+	const float fu = (float)def_skybox_width *fma(atan2(direction.x, direction.y),  0.5f/3.1415927f, 0.5f);
+	const float fv = (float)def_skybox_height*fma(asin (direction.z             ), -1.0f/3.1415927f, 0.5f);
 	const int ua=clamp((int)fu, 0, (int)def_skybox_width-1), va=clamp((int)fv, 0, (int)def_skybox_height-1), ub=(ua+1)%def_skybox_width, vb=min(va+1, (int)def_skybox_height-1); // bilinear interpolation positions
 	const uint s00=skybox[ua+va*def_skybox_width], s01=skybox[ua+vb*def_skybox_width], s10=skybox[ub+va*def_skybox_width], s11=skybox[ub+vb*def_skybox_width];
 	const float u1=fu-(float)ua, v1=fv-(float)va, u0=1.0f-u1, v0=1.0f-v1; // interpolation factors
 	return color_mix(color_mix(s00, s01, v0), color_mix(s10, s11, v0), u0); // perform bilinear interpolation
 }
-)+R(uint last_ray_reflectivity(const ray reflection, const ray transmission, const uint last_color, const float reflectivity, const global int* skybox) {
-	return color_mix(skybox_color(reflection, skybox), skybox_color(transmission, skybox), reflectivity);
+)+R(uint last_ray_reflectivity(const ray reflection, const ray transmission, const float reflectivity, const float transmissivity, const global int* skybox) {
+	return color_mix(skybox_color(reflection, skybox), color_mix(skybox_color(transmission, skybox), def_absorption_color, transmissivity), reflectivity);
 }
 )+R(float ray_grid_traverse(const ray r, const global float* phi, const global uchar* flags, float3* normal, const uint Nx, const uint Ny, const uint Nz) {
 	const float3 pa = r.origin;
@@ -739,34 +743,38 @@ string opencl_c_container() { return R( // ########################## begin of O
 	ray_reflect->direction = reflect(ray_in.direction, normal);
 	return true;
 }
-)+R(bool raytrace_phi(const ray ray_in, ray* ray_reflect, ray* ray_transmit, float* reflectivity, const global float* phi, const global uchar* flags, const global int* skybox, const uint Nx, const uint Ny, const uint Nz) {
+)+R(bool raytrace_phi(const ray ray_in, ray* ray_reflect, ray* ray_transmit, float* reflectivity, float* transmissivity, const global float* phi, const global uchar* flags, const global int* skybox, const uint Nx, const uint Ny, const uint Nz) {
 	float3 normal;
 	float d = ray_grid_traverse(ray_in, phi, flags, &normal, Nx, Ny, Nz); // move ray through lattice, at each cell call marching_cubes
 	if(d==-1.0f) return false; // no intersection found
+	const float ray_in_normal = dot(ray_in.direction, normal);
+	const bool is_inside = ray_in_normal>0.0f; // camera is in fluid
 	ray_reflect->origin = ray_in.origin+(d-0.0003163f)*ray_in.direction; // start intersection points a bit in front triangle to avoid self-reflection
 	ray_reflect->direction = reflect(ray_in.direction, normal); // compute reflection ray
 	ray ray_internal; // compute internal ray and transmission ray
 	ray_internal.origin = ray_in.origin+(d+0.0003163f)*ray_in.direction; // start intersection points a bit behind triangle to avoid self-transmission
 	ray_internal.direction = refract(ray_in.direction, normal, def_n);
-	const bool is_inside = dot(ray_in.direction, normal)>0.0f; // camera is in fluid
+	const float wr = clamp(sq(cb(2.0f*acospi(fabs(ray_in_normal)))), 0.0f, 1.0f); // increase reflectivity if ray intersects surface at shallow angle
 	if(is_inside) { // swap ray_reflect and ray_internal
 		const float3 ray_internal_origin = ray_internal.origin;
-		ray_internal.origin = ray_reflect->origin; // start intersection points a bit in front triangle to avoid self-reflection
+		ray_internal.origin = ray_reflect->origin;
 		ray_internal.direction = ray_reflect->direction;
-		ray_reflect->origin = ray_internal_origin; // start intersection points a bit behind triangle to avoid self-transmission
-		ray_reflect->direction = refract(ray_in.direction, -normal, 1.0f/def_n);
-	}
-	const float wr = sq(cb(2.0f*acospi(fabs(dot(ray_in.direction, normal))))); // increase reflectivity if ray intersects surface at shallow angle
-	*reflectivity = clamp(is_inside ? 1.0f-wr : wr, 0.0f, 1.0f); // ray_reflect and ray_transmit are switched if camera is in fluid
-	d = ray_grid_traverse(ray_internal, phi, flags, &normal, Nx, Ny, Nz);
-	if(d!=-1.0f) { // internal ray intersects isosurface
-		const float3 intersection_point = ray_internal.origin+(d+0.0003163f)*ray_internal.direction; // start intersection points a bit behind triangle to avoid self-transmission
-		ray_transmit->origin = intersection_point;
-		ray_transmit->direction = refract(ray_internal.direction, -normal, 1.0f/def_n);
-	} else { // internal ray does not intersect again
-		ray_transmit->origin = ray_internal.origin;
-		ray_transmit->direction = ray_internal.direction;
+		ray_reflect->origin = ray_internal_origin; // re-use internal ray origin
+		ray_reflect->direction = refract(ray_in.direction, -normal, 1.0f/def_n); // compute refraction again: refract out of fluid
+		if(sq(1.0f/def_n)-1.0f+sq(ray_in_normal)>=0.0f) { // refraction through Snell's window
+			ray_transmit->origin = ray_reflect->origin; // reflection ray and transmission ray are the same
+			ray_transmit->direction = ray_reflect->direction;
+			*reflectivity = 0.0f;
+			*transmissivity = exp(def_attenuation*d); // Beer-Lambert law
+			return true;
+		}
 	}
+	float d_internal = d;
+	d = ray_grid_traverse(ray_internal, phi, flags, &normal, Nx, Ny, Nz); // 2nd ray-grid traversal call: refraction (camera outside) or total internal reflection (camera inside)
+	ray_transmit->origin = d!=-1.0f ? ray_internal.origin+(d+0.0003163f)*ray_internal.direction : ray_internal.origin; // start intersection points a bit behind triangle to avoid self-transmission
+	ray_transmit->direction = d!=-1.0f ? refract(ray_internal.direction, -normal, 1.0f/def_n) : ray_internal.direction; // internal ray intersects isosurface : internal ray does not intersect again
+	*reflectivity = is_inside ? 0.0f : wr; // is_inside means camera is inside fluid, so this is a total internal reflection down here
+	*transmissivity = d!=-1.0f ? exp(def_attenuation*((float)is_inside*d_internal+d)) : (float)(def_attenuation==0.0f); // Beer-Lambert law
 	return true;
 }
 )+R(bool is_above_plane(const float3 point, const float3 plane_p, const float3 plane_n) {
@@ -2597,25 +2605,24 @@ string opencl_c_container() { return R( // ########################## begin of O
 	}
 }
 
-)+R(int raytrace_phi_next_ray(const ray reflection, const ray transmission, const int pixelcolor, const float reflectivity, const global float* phi, const global uchar* flags, const global int* skybox) {
-	int color_reflect=pixelcolor, color_transmit=pixelcolor;
+)+R(int raytrace_phi_next_ray(const ray reflection, const ray transmission, const float reflectivity, const float transmissivity, const global float* phi, const global uchar* flags, const global int* skybox) {
+	int color_reflect=0, color_transmit=0;
 	ray reflection_next, transmission_next;
-	float reflection_reflectivity, transmission_reflectivity;
-	if(raytrace_phi(reflection, &reflection_next, &transmission_next, &reflection_reflectivity, phi, flags, skybox, def_Nx, def_Ny, def_Nz)) {
-		color_reflect = last_ray_reflectivity(reflection_next, transmission_next, color_reflect, reflection_reflectivity, skybox);
+	float reflection_reflectivity, reflection_transmissivity, transmission_reflectivity, transmission_transmissivity;
+	if(raytrace_phi(reflection, &reflection_next, &transmission_next, &reflection_reflectivity, &reflection_transmissivity, phi, flags, skybox, def_Nx, def_Ny, def_Nz)) {
+		color_reflect = last_ray_reflectivity(reflection_next, transmission_next, reflection_reflectivity, reflection_transmissivity, skybox);
 	} else {
 		color_reflect = skybox_color(reflection, skybox);
 	}
-	if(raytrace_phi(transmission, &reflection_next, &transmission_next, &transmission_reflectivity, phi, flags, skybox, def_Nx, def_Ny, def_Nz)) {
-		color_transmit = last_ray_reflectivity(reflection_next, transmission_next, color_transmit, transmission_reflectivity, skybox);
+	if(raytrace_phi(transmission, &reflection_next, &transmission_next, &transmission_reflectivity, &transmission_transmissivity, phi, flags, skybox, def_Nx, def_Ny, def_Nz)) {
+		color_transmit = last_ray_reflectivity(reflection_next, transmission_next, transmission_reflectivity, transmission_transmissivity, skybox);
 	} else {
 		color_transmit = skybox_color(transmission, skybox);
 	}
-	return color_mix(color_reflect, color_transmit, reflectivity);
+	return color_mix(color_reflect, color_mix(color_transmit, def_absorption_color, transmissivity), reflectivity);
 }
-
-)+R(int raytrace_phi_next_ray_mirror(const ray reflection, const int pixelcolor, const global float* phi, const global uchar* flags, const global int* skybox) {
-	int color_reflect = pixelcolor;
+)+R(int raytrace_phi_next_ray_mirror(const ray reflection, const global float* phi, const global uchar* flags, const global int* skybox) {
+	int color_reflect = 0;
 	ray reflection_next;
 	if(raytrace_phi_mirror(reflection, &reflection_next, phi, flags, skybox, def_Nx, def_Ny, def_Nz)) {
 		color_reflect = skybox_color(reflection_next, skybox);
@@ -2637,20 +2644,20 @@ string opencl_c_container() { return R( // ########################## begin of O
 	float camera_cache[15]; // cache parameters in case the kernel draws more than one shape
 	for(uint i=0u; i<15u; i++) camera_cache[i] = camera[i];
 	ray camray = get_camray(x, y, camera_cache);
-	int pixelcolor = 0;
 	const float distance = intersect_cuboid(camray, (float3)(0.0f, 0.0f, 0.0f), (float)def_Nx, (float)def_Ny, (float)def_Nz);
 	camray.origin = camray.origin+fmax(distance, 0.0f)*camray.direction;
 	ray reflection, transmission; // reflection and transmission
-	float reflectivity;
-	if(raytrace_phi(camray, &reflection, &transmission, &reflectivity, phi, flags, skybox, def_Nx, def_Ny, def_Nz)) {
-		pixelcolor = last_ray_reflectivity(reflection, transmission, pixelcolor, reflectivity, skybox); // 1 ray pass
-		//pixelcolor = raytrace_phi_next_ray(reflection, transmission, pixelcolor, reflectivity, phi, flags, skybox); // 2 ray passes
+	float reflectivity, transmissivity;
+	int pixelcolor = 0;
+	if(raytrace_phi(camray, &reflection, &transmission, &reflectivity, &transmissivity, phi, flags, skybox, def_Nx, def_Ny, def_Nz)) {
+		pixelcolor = last_ray_reflectivity(reflection, transmission, reflectivity, transmissivity, skybox); // 1 ray pass
+		//pixelcolor = raytrace_phi_next_ray(reflection, transmission, reflectivity, transmissivity, phi, flags, skybox); // 2 ray passes
 	} else {
 		pixelcolor = skybox_color(camray, skybox);
 	}
 	//if(raytrace_phi_mirror(camray, &reflection, phi, flags, skybox, def_Nx, def_Ny, def_Nz)) { // reflection only
 	//	//pixelcolor = skybox_color(reflection, skybox); // 1 ray pass
-	//	pixelcolor = raytrace_phi_next_ray_mirror(reflection, pixelcolor, phi, flags, skybox); // 2 ray passes
+	//	pixelcolor = raytrace_phi_next_ray_mirror(reflection, phi, flags, skybox); // 2 ray passes
 	//} else {
 	//	pixelcolor = skybox_color(camray, skybox);
 	//}
diff --git a/src/lbm.cpp b/src/lbm.cpp
index b4ae40cb..9040818c 100644
--- a/src/lbm.cpp
+++ b/src/lbm.cpp
@@ -184,7 +184,7 @@ void LBM_Domain::voxelize_mesh_on_device(const Mesh* mesh, const uchar flag, con
 	Memory<float3> p1(device, mesh->triangle_number, 1u, mesh->p1);
 	Memory<float3> p2(device, mesh->triangle_number, 1u, mesh->p2);
 	Memory<float> bounding_box_and_velocity(device, 16u);
-	const float x0=mesh->pmin.x, y0=mesh->pmin.y, z0=mesh->pmin.z, x1=mesh->pmax.x, y1=mesh->pmax.y, z1=mesh->pmax.z; // use bounding box of mesh to speed up voxelization
+	const float x0=mesh->pmin.x-2.0f, y0=mesh->pmin.y-2.0f, z0=mesh->pmin.z-2.0f, x1=mesh->pmax.x+2.0f, y1=mesh->pmax.y+2.0f, z1=mesh->pmax.z+2.0f; // use bounding box of mesh to speed up voxelization; add tolerance of 2 cells for re-voxelization of moving objects
 	bounding_box_and_velocity[ 0] = as_float(mesh->triangle_number);
 	bounding_box_and_velocity[ 1] = x0;
 	bounding_box_and_velocity[ 2] = y0;
@@ -445,12 +445,14 @@ string LBM_Domain::Graphics::device_defines() const { return
 	"\n	#define def_background_color " +to_string(GRAPHICS_BACKGROUND_COLOR)+""
 	"\n	#define def_screen_width "     +to_string(camera.width)+"u"
 	"\n	#define def_screen_height "    +to_string(camera.height)+"u"
-	"\n	#define def_n "                +to_string(1.333f)+"f" // refractive index of water
 	"\n	#define def_scale_u "          +to_string(1.0f/(0.57735027f*(GRAPHICS_U_MAX)))+"f"
 	"\n	#define def_scale_Q_min "      +to_string(GRAPHICS_Q_CRITERION)+"f"
 	"\n	#define def_scale_F "          +to_string(GRAPHICS_BOUNDARY_FORCE_SCALE)+"f"
 	"\n	#define def_streamline_sparse "+to_string(GRAPHICS_STREAMLINE_SPARSE)+"u"
 	"\n	#define def_streamline_length "+to_string(GRAPHICS_STREAMLINE_LENGTH)+"u"
+	"\n	#define def_n "                +to_string(1.333f)+"f" // refractive index of water for raytracing graphics
+	"\n	#define def_attenuation "      +to_string(ln(GRAPHICS_RAYTRACING_TRANSMITTANCE)/(float)max(max(lbm->get_Nx(), lbm->get_Ny()), lbm->get_Nz()))+"f" // (negative) attenuation parameter for raytracing graphics
+	"\n	#define def_absorption_color " +to_string(GRAPHICS_RAYTRACING_COLOR)+"" // absorption color of fluid for raytracing graphics
 
 	"\n	#define COLOR_S (127<<16|127<<8|127)" // (stationary or moving) solid boundary
 	"\n	#define COLOR_E (  0<<16|255<<8|  0)" // equilibrium boundary (inflow/outflow)
diff --git a/src/main.cpp b/src/main.cpp
index cc063bfa..364a0c07 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -28,6 +28,16 @@ void main_label(const double frametime) {
 		if(!key_H) {
 			draw_label(camera.width-16*(FONT_WIDTH)-1, 2, "Press H for Help", c);
 		} else {
+#ifdef SURFACE
+			const bool surface = true;
+#else // SURFACE
+			const bool surface = false;
+#endif // SURFACE
+#ifdef PARTICLES
+			const bool particles = true;
+#else // PARTICLES
+			const bool particles = false;
+#endif // PARTICLES
 			const int ox=2, oy=2;
 			int i = 0;
 			draw_label(ox, oy+i, "Keyboard/Mouse Controls: ", c); i+=2*FONT_HEIGHT;
@@ -37,8 +47,9 @@ void main_label(const double frametime) {
 			draw_label(ox, oy+i, "2 ("+string(key_2?" active ":"inactive")+"): velocity field", c); i+=FONT_HEIGHT;
 			draw_label(ox, oy+i, "3 ("+string(key_3?" active ":"inactive")+"): streamlines", c); i+=FONT_HEIGHT;
 			draw_label(ox, oy+i, "4 ("+string(key_4?" active ":"inactive")+"): vorticity / velocity-colored Q-criterion isosurface", c); i+=FONT_HEIGHT;
-			draw_label(ox, oy+i, "5 ("+string(key_5?" active ":"inactive")+"): rasterized free surface", c); i+=FONT_HEIGHT;
-			draw_label(ox, oy+i, "6 ("+string(info.lbm->get_D()==1u ? (key_6?" active ":"inactive") : "disabled")+"): raytraced free surface", c); i+=2*FONT_HEIGHT;
+			draw_label(ox, oy+i, "5 ("+string(surface ? (key_5?" active ":"inactive") : "disabled")+"): rasterized free surface", c); i+=FONT_HEIGHT;
+			draw_label(ox, oy+i, "6 ("+string(surface&&info.lbm->get_D()==1u ? (key_6?" active ":"inactive") : "disabled")+"): raytraced free surface", c); i+=FONT_HEIGHT;
+			draw_label(ox, oy+i, "7 ("+string(particles ? (key_7?" active ":"inactive") : "disabled")+"): particles", c); i+=2*FONT_HEIGHT;
 			draw_label(ox, oy+i, "Mouse or I/J/K/L (rx="+alignr(4u, to_int(fmod(degrees(camera.rx)+90.0+360.0, 360.0)-180.0))+", ry="+alignr(3u, to_int(180.0-degrees(camera.ry)))+"): rotate camera", c); i+=FONT_HEIGHT;
 			draw_label(ox, oy+i, "Scrollwheel or +/- ("+to_string(camera.free ? (float)camera.free_camera_velocity : camera.zoom*(float)fmax(fmax(info.lbm->get_Nx(), info.lbm->get_Ny()), info.lbm->get_Nz())/(float)min(camera.width, camera.height), 3u)+"): zoom (centered camera mode) or camera movement speed (free camera mode)", c); i+=FONT_HEIGHT;
 			draw_label(ox, oy+i, "Mouseclick or U: toggle rotation with Mouse and angle snap rotation with I/J/K/L", c); i+=FONT_HEIGHT;
diff --git a/src/setup.cpp b/src/setup.cpp
index 8ddad3cf..57446cf9 100644
--- a/src/setup.cpp
+++ b/src/setup.cpp
@@ -500,7 +500,7 @@
 
 /*void main_setup() { // F1 car
 	// ######################################################### define simulation box size, viscosity and volume force ############################################################################
-	const uint L = 256u; // 2152u on 8x MI200
+	const uint L = 512u; // 2152u on 8x MI200
 	const float kmh = 100.0f;
 	const float si_u = kmh/3.6f;
 	const float si_x = 2.0f;
@@ -519,7 +519,7 @@
 	lbm.voxelize_stl(get_exe_path()+"../stl/Ferrari_SF71H_V5.stl", center, size); // https://www.thingiverse.com/thing:2990512/files (unfortunately, this model is not available anymore)
 	const ulong N=lbm.get_N(); const uint Nx=lbm.get_Nx(), Ny=lbm.get_Ny(), Nz=lbm.get_Nz(); for(ulong n=0ull; n<N; n++) { uint x=0u, y=0u, z=0u; lbm.coordinates(n, x, y, z);
 		// ########################################################################### define geometry #############################################################################################
-		//if(lbm.flags[n]!=TYPE_S) lbm.u.y[n] = u;
+		if(lbm.flags[n]!=TYPE_S) lbm.u.y[n] = u;
 		if(x==0u||x==Nx-1u||y==0u||y==Ny-1u||z==Nz-1u) lbm.flags[n] = TYPE_E;
 		if(z==0u) lbm.flags[n] = TYPE_S;
 		const float3 p = lbm.position(x, y, z);