From 0fbd48d84f5bf67a4baccc68c896d92a7300894e Mon Sep 17 00:00:00 2001
From: Moritz Lehmann <moritz.lehmann@uni-bayreuth.de>
Date: Tue, 18 Apr 2023 09:41:39 +0200
Subject: [PATCH 01/15] Added more benchmarks in Readme

---
 README.md | 116 +++++++++++++++++++++++++++---------------------------
 1 file changed, 59 insertions(+), 57 deletions(-)
diff --git a/README.md b/README.md
index ec56d81b..c9c9d8c0 100644
--- a/README.md
+++ b/README.md
@@ -320,33 +320,34 @@ If your GPU is not on the list yet, you can report your benchmarks [here](https:
 
 Colors: 🔴 AMD, 🔵 Intel, 🟢 Nvidia, 🟣 Apple, 🟡 Samsung
 
-| Device                                         | FP32<br>[TFlops/s] | Mem<br>[GB] | BW<br>[GB/s] | FP32/FP32<br>[MLUPs/s] | FP32/FP16S<br>[MLUPs/s] | FP32/FP16C<br>[MLUPs/s] |
-| :--------------------------------------------- | -----------------: | ----------: | -----------: | ---------------------: | ----------------------: | ----------------------: |
-|                                                |                    |             |              |                        |                         |                         |
-| 🔴&nbsp;Instinct&nbsp;MI250&nbsp;(1&nbsp;GCD)  |              45.26 |          64 |         1638 |             5638 (53%) |              9030 (42%) |              8506 (40%) |
-| 🔴&nbsp;Radeon&nbsp;VII                        |              13.83 |          16 |         1024 |             4898 (73%) |              7778 (58%) |              5256 (40%) |
+| Device                                          | FP32<br>[TFlops/s] | Mem<br>[GB] | BW<br>[GB/s] | FP32/FP32<br>[MLUPs/s] | FP32/FP16S<br>[MLUPs/s] | FP32/FP16C<br>[MLUPs/s] |
+| :---------------------------------------------- | -----------------: | ----------: | -----------: | ---------------------: | ----------------------: | ----------------------: |
+|                                                 |                    |             |              |                        |                         |                         |
+| 🔴&nbsp;Instinct&nbsp;MI250&nbsp;(1&nbsp;GCD)   |              45.26 |          64 |         1638 |             5638 (53%) |              9030 (42%) |              8506 (40%) |
+| 🔴&nbsp;Radeon&nbsp;VII                         |              13.83 |          16 |         1024 |             4898 (73%) |              7778 (58%) |              5256 (40%) |
 | 🟢&nbsp;H100&nbsp;PCIe&nbsp;80GB                |              51.01 |          80 |         2000 |       11128&nbsp;(85%) |             20624 (79%) |             13862 (53%) |
 | 🟢&nbsp;A100&nbsp;SXM4&nbsp;80GB                |              19.49 |          80 |         2039 |       10228&nbsp;(77%) |             18448 (70%) |             11197 (42%) |
 | 🟢&nbsp;A100&nbsp;SXM4&nbsp;40GB                |              19.49 |          40 |         1555 |             8522 (84%) |             16013 (79%) |             11251 (56%) |
 | 🟢&nbsp;A100&nbsp;PCIe&nbsp;40GB                |              19.49 |          40 |         1555 |             8526 (84%) |             16035 (79%) |             11088 (55%) |
 | 🟢&nbsp;Tesla&nbsp;V100&nbsp;16GB               |              14.13 |          16 |          900 |             5128 (87%) |             10325 (88%) |              7683 (66%) |
 | 🟢&nbsp;Quadro&nbsp;GV100                       |              16.66 |          32 |          870 |             3442 (61%) |              6641 (59%) |              5863 (52%) |
+| 🟢&nbsp;Titan&nbsp;V                            |              14.90 |          12 |          653 |             3601 (84%) |              7253 (86%) |              6957 (82%) |
 | 🟢&nbsp;Tesla&nbsp;P100&nbsp;16GB               |               9.52 |          16 |          732 |             3295 (69%) |              5950 (63%) |              4176 (44%) |
 | 🟢&nbsp;Tesla&nbsp;P100&nbsp;12GB               |               9.52 |          12 |          549 |             2427 (68%) |              4141 (58%) |              3999 (56%) |
 | 🟢&nbsp;Tesla&nbsp;K40m                         |               4.29 |          12 |          288 |             1131 (60%) |              1868 (50%) |               912 (24%) |
 | 🟢&nbsp;Tesla&nbsp;K80&nbsp;(1&nbsp;GPU)        |               4.11 |          12 |          240 |              916 (58%) |              1642 (53%) |               943 (30%) |
 | 🟢&nbsp;Tesla&nbsp;K20c                         |               3.52 |           5 |          208 |              861 (63%) |              1507 (56%) |               720 (27%) |
-|                                                |                    |             |              |                        |                         |                         |
-| 🔴&nbsp;Radeon&nbsp;RX&nbsp;7900&nbsp;XTX      |              61.44 |          24 |          960 |             3665 (58%) |              7644 (61%) |              7716 (62%) |
-| 🔴&nbsp;Radeon&nbsp;RX&nbsp;7900&nbsp;XT       |              51.61 |          20 |          800 |             3013 (58%) |              5856 (56%) |              5986 (58%) |
-| 🔴&nbsp;Radeon&nbsp;RX&nbsp;6900&nbsp;XT       |              23.04 |          16 |          512 |             1968 (59%) |              4227 (64%) |              4207 (63%) |
-| 🔴&nbsp;Radeon&nbsp;RX&nbsp;6800&nbsp;XT       |              20.74 |          16 |          512 |             2008 (60%) |              4241 (64%) |              4224 (64%) |
-| 🔴&nbsp;Radeon&nbsp;RX&nbsp;5700&nbsp;XT       |               9.75 |           8 |          448 |             1368 (47%) |              3253 (56%) |              3049 (52%) |
-| 🔴&nbsp;Radeon&nbsp;RX&nbsp;Vega&nbsp;64       |              13.35 |           8 |          484 |             1875 (59%) |              2878 (46%) |              3227 (51%) |
-| 🔴&nbsp;Radeon&nbsp;RX&nbsp;580&nbsp;4GB       |               6.50 |           4 |          256 |              946 (57%) |              1848 (56%) |              1577 (47%) |
-| 🔴&nbsp;Radeon&nbsp;HD&nbsp;7850               |               1.84 |           2 |          154 |              112 (11%) |               120 ( 6%) |               635 (32%) |
-| 🔵&nbsp;Arc&nbsp;A770&nbsp;LE                  |              19.66 |          16 |          560 |             2741 (75%) |              4591 (63%) |              4626 (64%) |
-| 🔵&nbsp;Arc&nbsp;A750&nbsp;LE                  |              17.20 |           8 |          512 |             2625 (78%) |              4184 (63%) |              4238 (64%) |
+|                                                 |                    |             |              |                        |                         |                         |
+| 🔴&nbsp;Radeon&nbsp;RX&nbsp;7900&nbsp;XTX       |              61.44 |          24 |          960 |             3665 (58%) |              7644 (61%) |              7716 (62%) |
+| 🔴&nbsp;Radeon&nbsp;RX&nbsp;7900&nbsp;XT        |              51.61 |          20 |          800 |             3013 (58%) |              5856 (56%) |              5986 (58%) |
+| 🔴&nbsp;Radeon&nbsp;RX&nbsp;6900&nbsp;XT        |              23.04 |          16 |          512 |             1968 (59%) |              4227 (64%) |              4207 (63%) |
+| 🔴&nbsp;Radeon&nbsp;RX&nbsp;6800&nbsp;XT        |              20.74 |          16 |          512 |             2008 (60%) |              4241 (64%) |              4224 (64%) |
+| 🔴&nbsp;Radeon&nbsp;RX&nbsp;5700&nbsp;XT        |               9.75 |           8 |          448 |             1368 (47%) |              3253 (56%) |              3049 (52%) |
+| 🔴&nbsp;Radeon&nbsp;RX&nbsp;Vega&nbsp;64        |              13.35 |           8 |          484 |             1875 (59%) |              2878 (46%) |              3227 (51%) |
+| 🔴&nbsp;Radeon&nbsp;RX&nbsp;580&nbsp;4GB        |               6.50 |           4 |          256 |              946 (57%) |              1848 (56%) |              1577 (47%) |
+| 🔴&nbsp;Radeon&nbsp;HD&nbsp;7850                |               1.84 |           2 |          154 |              112 (11%) |               120 ( 6%) |               635 (32%) |
+| 🔵&nbsp;Arc&nbsp;A770&nbsp;LE                   |              19.66 |          16 |          560 |             2741 (75%) |              4591 (63%) |              4626 (64%) |
+| 🔵&nbsp;Arc&nbsp;A750&nbsp;LE                   |              17.20 |           8 |          512 |             2625 (78%) |              4184 (63%) |              4238 (64%) |
 | 🟢&nbsp;GeForce&nbsp;RTX&nbsp;4090              |              82.58 |          24 |         1008 |             5624 (85%) |             11091 (85%) |             11496 (88%) |
 | 🟢&nbsp;RTX&nbsp;6000&nbsp;Ada                  |              91.10 |          48 |          960 |             4997 (80%) |             10249 (82%) |             10293 (83%) |
 | 🟢&nbsp;GeForce&nbsp;RTX&nbsp;4080              |              55.45 |          16 |          717 |             3914 (84%) |              7626 (82%) |              7933 (85%) |
@@ -361,6 +362,7 @@ Colors: 🔴 AMD, 🔵 Intel, 🟢 Nvidia, 🟣 Apple, 🟡 Samsung
 | 🟢&nbsp;GeForce&nbsp;RTX&nbsp;3060              |              13.17 |          12 |          360 |             2108 (90%) |              4070 (87%) |              3566 (76%) |
 | 🟢&nbsp;GeForce&nbsp;RTX&nbsp;3060M             |              10.94 |           6 |          336 |             2019 (92%) |              4012 (92%) |              3572 (82%) |
 | 🟢&nbsp;GeForce&nbsp;RTX&nbsp;3050M             |               7.13 |           4 |          192 |             1180 (94%) |              2339 (94%) |              2016 (81%) |
+| 🟢&nbsp;Titan&nbsp;RTX                          |              16.31 |          24 |          672 |             3471 (79%) |              7456 (85%) |              7554 (87%) |
 | 🟢&nbsp;Quadro&nbsp;RTX&nbsp;6000               |              16.31 |          24 |          672 |             3307 (75%) |              6836 (78%) |              6879 (79%) |
 | 🟢&nbsp;Quadro&nbsp;RTX&nbsp;8000&nbsp;Pass.    |              14.93 |          48 |          624 |             2591 (64%) |              5408 (67%) |              5607 (69%) |
 | 🟢&nbsp;GeForce&nbsp;RTX&nbsp;2080&nbsp;Ti      |              13.45 |          11 |          616 |             3194 (79%) |              6700 (84%) |              6853 (86%) |
@@ -388,27 +390,27 @@ Colors: 🔴 AMD, 🔵 Intel, 🟢 Nvidia, 🟣 Apple, 🟡 Samsung
 | 🟢&nbsp;Quadro&nbsp;K2000                       |               0.73 |           2 |           64 |              312 (75%) |               444 (53%) |               171 (21%) |
 | 🟢&nbsp;GeForce&nbsp;GT&nbsp;630&nbsp;(OEM)     |               0.46 |           2 |           29 |              151 (81%) |               185 (50%) |                78 (21%) |
 | 🟢&nbsp;Quadro&nbsp;NVS&nbsp;290                |               0.03 |       0.256 |            6 |                1 ( 2%) |                 1 ( 1%) |                 1 ( 1%) |
-|                                                |                    |             |              |                        |                         |                         |
+|                                                 |                    |             |              |                        |                         |                         |
 | 🟣&nbsp;M1&nbsp;Pro&nbsp;GPU&nbsp;16C&nbsp;16GB |               4.10 |          11 |          200 |             1204 (92%) |              2329 (90%) |              1855 (71%) |
-| 🔴&nbsp;Radeon&nbsp;Vega&nbsp;8&nbsp;(4750G)  |               2.15 |          27 |           57 |              263 (71%) |               511 (70%) |               501 (68%) |
-| 🔴&nbsp;Radeon&nbsp;Vega&nbsp;8&nbsp;(3500U)  |               1.23 |           7 |           38 |              157 (63%) |               282 (57%) |               288 (58%) |
-| 🔵&nbsp;UHD&nbsp;Graphics&nbsp;630            |               0.46 |           7 |           51 |              151 (45%) |               301 (45%) |               187 (28%) |
-| 🔵&nbsp;HD&nbsp;Graphics&nbsp;5500            |               0.35 |           3 |           26 |               75 (45%) |               192 (58%) |               108 (32%) |
-| 🔵&nbsp;HD&nbsp;Graphics&nbsp;4600            |               0.38 |           2 |           26 |              105 (63%) |               115 (35%) |                34 (10%) |
+| 🔴&nbsp;Radeon&nbsp;Vega&nbsp;8&nbsp;(4750G)    |               2.15 |          27 |           57 |              263 (71%) |               511 (70%) |               501 (68%) |
+| 🔴&nbsp;Radeon&nbsp;Vega&nbsp;8&nbsp;(3500U)    |               1.23 |           7 |           38 |              157 (63%) |               282 (57%) |               288 (58%) |
+| 🔵&nbsp;UHD&nbsp;Graphics&nbsp;630              |               0.46 |           7 |           51 |              151 (45%) |               301 (45%) |               187 (28%) |
+| 🔵&nbsp;HD&nbsp;Graphics&nbsp;5500              |               0.35 |           3 |           26 |               75 (45%) |               192 (58%) |               108 (32%) |
+| 🔵&nbsp;HD&nbsp;Graphics&nbsp;4600              |               0.38 |           2 |           26 |              105 (63%) |               115 (35%) |                34 (10%) |
 | 🟡&nbsp;ARM&nbsp;Mali-G72&nbsp;MP18             |               0.24 |           4 |           29 |               14 ( 7%) |                17 ( 5%) |                12 ( 3%) |
-|                                                |                    |             |              |                        |                         |                         |
-| 🔴&nbsp;2x&nbsp;EPYC&nbsp;9654                |              29.49 |        1536 |          922 |             1381 (23%) |              1814 (15%) |              1801 (15%) |
-| 🔵&nbsp;Xeon&nbsp;Phi&nbsp;7210               |               5.32 |         192 |          102 |              415 (62%) |               193 (15%) |               223 (17%) |
-| 🔵&nbsp;4x&nbsp;Xeon&nbsp;E5-4620&nbsp;v4     |               2.69 |         512 |          273 |              460 (26%) |               275 ( 8%) |               239 ( 7%) |
-| 🔵&nbsp;2x&nbsp;Xeon&nbsp;E5-2630&nbsp;v4     |               1.41 |          64 |          137 |              264 (30%) |               146 ( 8%) |               129 ( 7%) |
-| 🔵&nbsp;2x&nbsp;Xeon&nbsp;E5-2623&nbsp;v4     |               0.67 |          64 |          137 |              125 (14%) |                66 ( 4%) |                59 ( 3%) |
-| 🔵&nbsp;2x&nbsp;Xeon&nbsp;E5-2680&nbsp;v3     |               1.92 |          64 |          137 |              209 (23%) |               305 (17%) |               281 (16%) |
-| 🔵&nbsp;Core&nbsp;i9-10980XE                  |               3.23 |         128 |           94 |              286 (47%) |               251 (21%) |               223 (18%) |
-| 🔵&nbsp;Core&nbsp;i5-9600                     |               0.60 |          16 |           43 |              146 (52%) |               127 (23%) |               147 (27%) |
-| 🔵&nbsp;Core&nbsp;i7-8700K                    |               0.71 |          16 |           51 |              152 (45%) |               134 (20%) |               116 (17%) |
-| 🔵&nbsp;Core&nbsp;i7-7700HQ                   |               0.36 |          12 |           38 |               81 (32%) |                82 (16%) |               108 (22%) |
-| 🔵&nbsp;Core&nbsp;i7-4770                     |               0.44 |          16 |           26 |              104 (62%) |                69 (21%) |                59 (18%) |
-| 🔵&nbsp;Core&nbsp;i7-4720HQ                   |               0.33 |          16 |           26 |               58 (35%) |                13 ( 4%) |                47 (14%) |
+|                                                 |                    |             |              |                        |                         |                         |
+| 🔴&nbsp;2x&nbsp;EPYC&nbsp;9654                  |              29.49 |        1536 |          922 |             1381 (23%) |              1814 (15%) |              1801 (15%) |
+| 🔵&nbsp;Xeon&nbsp;Phi&nbsp;7210                 |               5.32 |         192 |          102 |              415 (62%) |               193 (15%) |               223 (17%) |
+| 🔵&nbsp;4x&nbsp;Xeon&nbsp;E5-4620&nbsp;v4       |               2.69 |         512 |          273 |              460 (26%) |               275 ( 8%) |               239 ( 7%) |
+| 🔵&nbsp;2x&nbsp;Xeon&nbsp;E5-2630&nbsp;v4       |               1.41 |          64 |          137 |              264 (30%) |               146 ( 8%) |               129 ( 7%) |
+| 🔵&nbsp;2x&nbsp;Xeon&nbsp;E5-2623&nbsp;v4       |               0.67 |          64 |          137 |              125 (14%) |                66 ( 4%) |                59 ( 3%) |
+| 🔵&nbsp;2x&nbsp;Xeon&nbsp;E5-2680&nbsp;v3       |               1.92 |          64 |          137 |              209 (23%) |               305 (17%) |               281 (16%) |
+| 🔵&nbsp;Core&nbsp;i9-10980XE                    |               3.23 |         128 |           94 |              286 (47%) |               251 (21%) |               223 (18%) |
+| 🔵&nbsp;Core&nbsp;i5-9600                       |               0.60 |          16 |           43 |              146 (52%) |               127 (23%) |               147 (27%) |
+| 🔵&nbsp;Core&nbsp;i7-8700K                      |               0.71 |          16 |           51 |              152 (45%) |               134 (20%) |               116 (17%) |
+| 🔵&nbsp;Core&nbsp;i7-7700HQ                     |               0.36 |          12 |           38 |               81 (32%) |                82 (16%) |               108 (22%) |
+| 🔵&nbsp;Core&nbsp;i7-4770                       |               0.44 |          16 |           26 |              104 (62%) |                69 (21%) |                59 (18%) |
+| 🔵&nbsp;Core&nbsp;i7-4720HQ                     |               0.33 |          16 |           26 |               58 (35%) |                13 ( 4%) |                47 (14%) |
 
 
 
@@ -418,39 +420,39 @@ Multi-GPU benchmarks are done at the largest possible grid resolution with a cub
 
 Colors: 🔴 AMD, 🔵 Intel, 🟢 Nvidia, 🟣 Apple, 🟡 Samsung
 
-| Device                                                         | FP32<br>[TFlops/s] | Mem<br>[GB] | BW<br>[GB/s] | FP32/FP32<br>[MLUPs/s] | FP32/FP16S<br>[MLUPs/s] | FP32/FP16C<br>[MLUPs/s] |
-| :------------------------------------------------------------- | -----------------: | ----------: | -----------: | ---------------------: | ----------------------: | ----------------------: |
-|                                                                |                    |             |              |                        |                         |                         |
-| 🔴&nbsp;1x&nbsp;Instinct&nbsp;MI250&nbsp;(1&nbsp;GCD)          |              45.26 |          64 |         1638 |             5638 (53%) |              9030 (42%) |              8506 (40%) |
-| 🔴&nbsp;1x&nbsp;Instinct&nbsp;MI250&nbsp;(2&nbsp;GCD)          |              90.52 |         128 |         3277 |            9460 (1.7x) |            14313 (1.6x) |            17338 (2.0x) |
-| 🔴&nbsp;2x&nbsp;Instinct&nbsp;MI250&nbsp;(4&nbsp;GCD)          |             181.04 |         256 |         6554 |           16925 (3.0x) |            29163 (3.2x) |            29627 (3.5x) |
-| 🔴&nbsp;4x&nbsp;Instinct&nbsp;MI250&nbsp;(8&nbsp;GCD)          |             362.08 |         512 |        13107 |           27350 (4.9x) |            52258 (5.8x) |            53521 (6.3x) |
-|                                                                |                    |             |              |                        |                         |                         |
-| 🔴&nbsp;1x&nbsp;Radeon&nbsp;VII                                |              13.83 |          16 |         1024 |             4898 (73%) |              7778 (58%) |              5256 (40%) |
-| 🔴&nbsp;2x&nbsp;Radeon&nbsp;VII                                |              27.66 |          32 |         2048 |            8113 (1.7x) |            15591 (2.0x) |            10352 (2.0x) |
-| 🔴&nbsp;4x&nbsp;Radeon&nbsp;VII                                |              55.32 |          64 |         4096 |           12911 (2.6x) |            24273 (3.1x) |            17080 (3.2x) |
-| 🔴&nbsp;8x&nbsp;Radeon&nbsp;VII                                |             110.64 |         128 |         8192 |           21946 (4.5x) |            30826 (4.0x) |            24572 (4.7x) |
-|                                                                |                    |             |              |                        |                         |                         |
+| Device                                                          | FP32<br>[TFlops/s] | Mem<br>[GB] | BW<br>[GB/s] | FP32/FP32<br>[MLUPs/s] | FP32/FP16S<br>[MLUPs/s] | FP32/FP16C<br>[MLUPs/s] |
+| :-------------------------------------------------------------- | -----------------: | ----------: | -----------: | ---------------------: | ----------------------: | ----------------------: |
+|                                                                 |                    |             |              |                        |                         |                         |
+| 🔴&nbsp;1x&nbsp;Instinct&nbsp;MI250&nbsp;(1&nbsp;GCD)           |              45.26 |          64 |         1638 |             5638 (53%) |              9030 (42%) |              8506 (40%) |
+| 🔴&nbsp;1x&nbsp;Instinct&nbsp;MI250&nbsp;(2&nbsp;GCD)           |              90.52 |         128 |         3277 |            9460 (1.7x) |            14313 (1.6x) |            17338 (2.0x) |
+| 🔴&nbsp;2x&nbsp;Instinct&nbsp;MI250&nbsp;(4&nbsp;GCD)           |             181.04 |         256 |         6554 |      16925&nbsp;(3.0x) |            29163 (3.2x) |            29627 (3.5x) |
+| 🔴&nbsp;4x&nbsp;Instinct&nbsp;MI250&nbsp;(8&nbsp;GCD)           |             362.08 |         512 |        13107 |      27350&nbsp;(4.9x) |            52258 (5.8x) |            53521 (6.3x) |
+|                                                                 |                    |             |              |                        |                         |                         |
+| 🔴&nbsp;1x&nbsp;Radeon&nbsp;VII                                 |              13.83 |          16 |         1024 |             4898 (73%) |              7778 (58%) |              5256 (40%) |
+| 🔴&nbsp;2x&nbsp;Radeon&nbsp;VII                                 |              27.66 |          32 |         2048 |            8113 (1.7x) |            15591 (2.0x) |            10352 (2.0x) |
+| 🔴&nbsp;4x&nbsp;Radeon&nbsp;VII                                 |              55.32 |          64 |         4096 |      12911&nbsp;(2.6x) |            24273 (3.1x) |            17080 (3.2x) |
+| 🔴&nbsp;8x&nbsp;Radeon&nbsp;VII                                 |             110.64 |         128 |         8192 |      21946&nbsp;(4.5x) |            30826 (4.0x) |            24572 (4.7x) |
+|                                                                 |                    |             |              |                        |                         |                         |
 | 🟢&nbsp;1x&nbsp;A100&nbsp;SXM4&nbsp;40GB                        |              19.49 |          40 |         1555 |             8522 (84%) |             16013 (79%) |             11251 (56%) |
-| 🟢&nbsp;2x&nbsp;A100&nbsp;SXM4&nbsp;40GB                        |              38.98 |          80 |         3110 |           13629 (1.6x) |            24620 (1.5x) |            18850 (1.7x) |
-| 🟢&nbsp;4x&nbsp;A100&nbsp;SXM4&nbsp;40GB                        |              77.96 |         160 |         6220 |           17978 (2.1x) |            30604 (1.9x) |            30627 (2.7x) |
-|                                                                |                    |             |              |                        |                         |                         |
+| 🟢&nbsp;2x&nbsp;A100&nbsp;SXM4&nbsp;40GB                        |              38.98 |          80 |         3110 |      13629&nbsp;(1.6x) |            24620 (1.5x) |            18850 (1.7x) |
+| 🟢&nbsp;4x&nbsp;A100&nbsp;SXM4&nbsp;40GB                        |              77.96 |         160 |         6220 |      17978&nbsp;(2.1x) |            30604 (1.9x) |            30627 (2.7x) |
+|                                                                 |                    |             |              |                        |                         |                         |
 | 🟢&nbsp;1x&nbsp;Tesla&nbsp;K40m                                 |               4.29 |          12 |          288 |             1131 (60%) |              1868 (50%) |               912 (24%) |
 | 🟢&nbsp;2x&nbsp;Tesla&nbsp;K40m                                 |               8.58 |          24 |          577 |            1971 (1.7x) |             3300 (1.8x) |             1801 (2.0x) |
 | 🟢&nbsp;3x&nbsp;K40m&nbsp;+&nbsp;1x&nbsp;Titan&nbsp;Xp          |              17.16 |          48 |         1154 |            3117 (2.8x) |             5174 (2.8x) |             3127 (3.4x) |
-|                                                                |                    |             |              |                        |                         |                         |
+|                                                                 |                    |             |              |                        |                         |                         |
 | 🟢&nbsp;1x&nbsp;RTX&nbsp;A6000                                  |              40.00 |          48 |          768 |             4421 (88%) |              8814 (88%) |              8533 (86%) |
 | 🟢&nbsp;2x&nbsp;RTX&nbsp;A6000                                  |              80.00 |          96 |         1536 |            8041 (1.8x) |            15026 (1.7x) |            14795 (1.7x) |
-| 🟢&nbsp;4x&nbsp;RTX&nbsp;A6000                                  |             160.00 |         192 |         3072 |           14314 (3.2x) |            27915 (3.2x) |            27227 (3.2x) |
-| 🟢&nbsp;8x&nbsp;RTX&nbsp;A6000                                  |             320.00 |         384 |         6144 |           19311 (4.4x) |            40063 (4.5x) |            39004 (4.6x) |
-|                                                                |                    |             |              |                        |                         |                         |
+| 🟢&nbsp;4x&nbsp;RTX&nbsp;A6000                                  |             160.00 |         192 |         3072 |      14314&nbsp;(3.2x) |            27915 (3.2x) |            27227 (3.2x) |
+| 🟢&nbsp;8x&nbsp;RTX&nbsp;A6000                                  |             320.00 |         384 |         6144 |      19311&nbsp;(4.4x) |            40063 (4.5x) |            39004 (4.6x) |
+|                                                                 |                    |             |              |                        |                         |                         |
 | 🟢&nbsp;1x&nbsp;Quadro&nbsp;RTX&nbsp;8000&nbsp;Pa.              |              14.93 |          48 |          624 |             2591 (64%) |              5408 (67%) |              5607 (69%) |
 | 🟢&nbsp;2x&nbsp;Quadro&nbsp;RTX&nbsp;8000&nbsp;Pa.              |              29.86 |          96 |         1248 |            4767 (1.8x) |             9607 (1.8x) |            10214 (1.8x) |
-|                                                                |                    |             |              |                        |                         |                         |
+|                                                                 |                    |             |              |                        |                         |                         |
 | 🟢&nbsp;1x&nbsp;GeForce&nbsp;RTX&nbsp;2080&nbsp;Ti              |              13.45 |          11 |          616 |             3194 (79%) |              6700 (84%) |              6853 (86%) |
 | 🟢&nbsp;2x&nbsp;GeForce&nbsp;RTX&nbsp;2080&nbsp;Ti              |              26.90 |          22 |         1232 |            5085 (1.6x) |            10770 (1.6x) |            10922 (1.6x) |
 | 🟢&nbsp;4x&nbsp;GeForce&nbsp;RTX&nbsp;2080&nbsp;Ti              |              53.80 |          44 |         2464 |            9117 (2.9x) |            18415 (2.7x) |            18598 (2.7x) |
-| 🟢&nbsp;7x&nbsp;2080&nbsp;Ti&nbsp;+&nbsp;1x&nbsp;A100&nbsp;40GB |             107.60 |          88 |         4928 |           16146 (5.1x) |            33732 (5.0x) |            33857 (4.9x) |
+| 🟢&nbsp;7x&nbsp;2080&nbsp;Ti&nbsp;+&nbsp;1x&nbsp;A100&nbsp;40GB |             107.60 |          88 |         4928 |      16146&nbsp;(5.1x) |            33732 (5.0x) |            33857 (4.9x) |
 
 
 

From e1e626ec2777a569457395ee7ac15c30ddd9ff9b Mon Sep 17 00:00:00 2001
From: Moritz Lehmann <moritz.lehmann@uni-bayreuth.de>
Date: Wed, 19 Apr 2023 17:37:39 +0200
Subject: [PATCH 02/15] Made correction of wrong memory reporting on Intel Arc
 more robust

---
 src/opencl.hpp | 16 ++++++----------
 1 file changed, 6 insertions(+), 10 deletions(-)

diff --git a/src/opencl.hpp b/src/opencl.hpp
index 677a1c10..23d572d3 100644
--- a/src/opencl.hpp
+++ b/src/opencl.hpp
@@ -65,16 +65,12 @@ struct Device_Info {
 		cores = to_uint((float)compute_units*(nvidia+amd+intel+apple+arm)); // for CPUs, compute_units is the number of threads (twice the number of cores with hyperthreading)
 		tflops = 1E-6f*(float)cores*(float)ipc*(float)clock_frequency; // estimated device floating point performance in TeraFLOPs/s
 		if(intel==8.0f) { // fix wrong global memory reporting for Intel Arc GPUs
-			if(contains_any(name, {"A770", "0x56a0"})&&(memory==12992u)) memory = 16240u; // fix wrong (80% on Windows) memory reporting on Intel Arc A770 16GB
-			if(contains_any(name, {"A770", "0x56a0"})&&(memory== 6476u)) memory =  8096u; // fix wrong (80% on Windows) memory reporting on Intel Arc A770 8GB
-			if(contains_any(name, {"A750", "0x56a1"})&&(memory== 6476u)) memory =  8096u; // fix wrong (80% on Windows) memory reporting on Intel Arc A750 8GB
-			if(contains_any(name, {"A580", "0x56a2"})&&(memory== 6476u)) memory =  8096u; // fix wrong (80% on Windows) memory reporting on Intel Arc A580 8GB
-			if(contains_any(name, {"A380", "0x56a5"})&&(memory== 4844u)) memory =  6056u; // fix wrong (80% on Windows) memory reporting on Intel Arc A380 6GB
-			if(contains_any(name, {"A770", "0x56a0"})&&(memory==15473u)) memory = 16288u; // fix wrong (95% on Linux) memory reporting on Intel Arc A770 16GB
-			if(contains_any(name, {"A770", "0x56a0"})&&(memory== 7721u)) memory =  8128u; // fix wrong (95% on Linux) memory reporting on Intel Arc A770 8GB
-			if(contains_any(name, {"A750", "0x56a1"})&&(memory== 7721u)) memory =  8128u; // fix wrong (95% on Linux) memory reporting on Intel Arc A750 8GB
-			if(contains_any(name, {"A580", "0x56a2"})&&(memory== 7721u)) memory =  8128u; // fix wrong (95% on Linux) memory reporting on Intel Arc A580 8GB
-			if(contains_any(name, {"A380"  "0x56a5"})&&(memory== 5783u)) memory =  6088u; // fix wrong (95% on Linux) memory reporting on Intel Arc A380 6GB
+			if((contains(name, "A770")&&memory>=12602u&&memory<13416u)||(contains_any(name, {"A770", "A750", "A580"})&&memory>=6286u&&memory<6693u)||(contains(name, "A380")&&memory>=4705u&&memory<5010u)) { // 77.5%-82.5% reporting -> /0.8
+				memory = (uint)((cl_device.getInfo<CL_DEVICE_GLOBAL_MEM_SIZE>()*5ull/4ull)/1048576ull); // fix wrong (80% on Windows) memory reporting on Intel Arc
+			}
+			if((contains_any(name, {"A770", "0x56a0"})&&memory>=15041u&&memory<15855u)||(contains_any(name, {"A770", "A750", "A580", "0x56a0", "0x56a1", "0x56a2"})&&memory>=7503u&&memory<7910u)||(contains_any(name, {"A380", "0x56a5"})&&memory>=5616u&&memory<5921u)) { // 92.5%-97.5% reporting -> /0.95
+				memory = (uint)((cl_device.getInfo<CL_DEVICE_GLOBAL_MEM_SIZE>()*20ull/19ull)/1048576ull); // fix wrong (95% on Linux) memory reporting on Intel Arc
+			}
 		}
 		intel_gpu_above_4gb_patch = (intel==8.0f)&&(memory>4096); // enable memory allocations greater than 4GB for Intel GPUs with >4GB VRAM
 	}

From 6561d0551691e7a60514e7a73b6aa08f909b24b0 Mon Sep 17 00:00:00 2001
From: Moritz Lehmann <moritz.lehmann@uni-bayreuth.de>
Date: Thu, 27 Apr 2023 18:39:15 +0200
Subject: [PATCH 03/15] Added more benchmarks in Readme

---
 README.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/README.md b/README.md
index c9c9d8c0..dc10f1f7 100644
--- a/README.md
+++ b/README.md
@@ -391,6 +391,7 @@ Colors: 🔴 AMD, 🔵 Intel, 🟢 Nvidia, 🟣 Apple, 🟡 Samsung
 | 🟢&nbsp;GeForce&nbsp;GT&nbsp;630&nbsp;(OEM)     |               0.46 |           2 |           29 |              151 (81%) |               185 (50%) |                78 (21%) |
 | 🟢&nbsp;Quadro&nbsp;NVS&nbsp;290                |               0.03 |       0.256 |            6 |                1 ( 2%) |                 1 ( 1%) |                 1 ( 1%) |
 |                                                 |                    |             |              |                        |                         |                         |
+| 🟣&nbsp;M1&nbsp;Max&nbsp;GPU&nbsp;24C&nbsp;32GB |               6.14 |          22 |          400 |             2369 (91%) |              4496 (87%) |              2777 (53%) |
 | 🟣&nbsp;M1&nbsp;Pro&nbsp;GPU&nbsp;16C&nbsp;16GB |               4.10 |          11 |          200 |             1204 (92%) |              2329 (90%) |              1855 (71%) |
 | 🔴&nbsp;Radeon&nbsp;Vega&nbsp;8&nbsp;(4750G)    |               2.15 |          27 |           57 |              263 (71%) |               511 (70%) |               501 (68%) |
 | 🔴&nbsp;Radeon&nbsp;Vega&nbsp;8&nbsp;(3500U)    |               1.23 |           7 |           38 |              157 (63%) |               282 (57%) |               288 (58%) |

From 1f95c0ab7d01b7ab1de9a7f9ab3d89d72ae587db Mon Sep 17 00:00:00 2001
From: Moritz Lehmann <moritz.lehmann@uni-bayreuth.de>
Date: Sat, 29 Apr 2023 09:43:14 +0200
Subject: [PATCH 04/15] Updated Readme, added more benchmarks in Readme

---
 README.md | 108 ++++++++++++++++++++++++++++++------------------------
 1 file changed, 60 insertions(+), 48 deletions(-)

diff --git a/README.md b/README.md
index dc10f1f7..b16526a7 100644
--- a/README.md
+++ b/README.md
@@ -61,7 +61,7 @@ The fastest and most memory efficient lattice Boltzmann CFD software, running on
 </details>
 
 
-## Compute Features
+## Compute Features - Getting the Memory Problem under Control
 
 - <details><summary>CFD model: lattice Boltzmann method (LBM)</summary>
 
@@ -211,39 +211,41 @@ $$f_j(i\\%2\\ ?\\ \vec{x}+\vec{e}_i\\ :\\ \vec{x},\\ t+\Delta t)=f_i^\textrm{tem
 - [peak performance on GPUs](#single-gpu-benchmarks) (datacenter/gaming/professional/laptop), validated with roofline model
 - [DDF-shifting](https://www.researchgate.net/publication/362275548_Accuracy_and_performance_of_the_lattice_Boltzmann_method_with_64-bit_32-bit_and_customized_16-bit_number_formats) and other algebraic optimization to minimize round-off error
 
+- <details><summary>powerful model extensions</summary>
+
+  - [boundary types](https://doi.org/10.15495/EPub_UBT_00005400)
+    - stationary mid-grid bounce-back boundaries (stationary solid boundaries)
+    - moving mid-grid bounce-back boundaries (moving solid boundaries)
+    - equilibrium boundaries (non-reflective inflow/outflow)
+    - temperature boundaries (fixed temperature)
+  - global force per volume (Guo forcing), can be modified on-the-fly
+  - local force per volume (force field)
+    - optional computation of forces from the fluid on solid boundaries
+  - state-of-the-art [free surface LBM](https://doi.org/10.3390/computation10060092) (FSLBM) implementation:
+    - [volume-of-fluid model](https://doi.org/10.15495/EPub_UBT_00005400)
+    - [fully analytic PLIC](https://doi.org/10.3390/computation10020021) for efficient curvature calculation
+    - improved mass conservation
+    - ultra efficient implementation with only [4 kernels](https://doi.org/10.3390/computation10060092) in addition to the `stream_collide()` kernel
+  - thermal LBM to simulate thermal convection
+    - D3Q7 subgrid for thermal DDFs
+    - in-place streaming with [Esoteric-Pull](https://doi.org/10.3390/computation10060092) for thermal DDFs
+    - optional [FP16S or FP16C compression](https://www.researchgate.net/publication/362275548_Accuracy_and_performance_of_the_lattice_Boltzmann_method_with_64-bit_32-bit_and_customized_16-bit_number_formats) for thermal DDFs with [DDF-shifting](https://www.researchgate.net/publication/362275548_Accuracy_and_performance_of_the_lattice_Boltzmann_method_with_64-bit_32-bit_and_customized_16-bit_number_formats)
+  - Smagorinsky-Lilly subgrid turbulence LES model to keep simulations with very large Reynolds number stable
+    <p align="center"><i>&Pi;<sub>&alpha;&beta;</sub></i> = &Sigma;<sub><i>i</i></sub> <i>e<sub>i&alpha;</sub></i> <i>e<sub>i&beta;</sub></i> (<i>f<sub>i</sub></i>   - <i>f<sub>i</sub></i><sup>eq-shifted</sup>)<br><br>Q = &Sigma;<sub><i>&alpha;&beta;</i></sub>   <i>&Pi;<sub>&alpha;&beta;</sub></i><sup>2</sup><br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;______________________<br>&tau; = &frac12; (&tau;<sub>0</sub> + &radic; &tau;<sub>0</sub><sup>2</sup> + <sup>(16&radic;2)</sup>&#8725;<sub>(<i>3&pi;</i><sup>2</sup>)</sub> <sup>&radic;Q</sup>&#8725;<sub><i>&rho;</i></sub> )</p>
+  - particles with immersed-boundary method (either passive or 2-way-coupled, only supported with single-GPU)
 
-
-## Optional Compute Extensions
-
-- [boundary types](https://doi.org/10.15495/EPub_UBT_00005400)
-  - stationary mid-grid bounce-back boundaries (stationary solid boundaries)
-  - moving mid-grid bounce-back boundaries (moving solid boundaries)
-  - equilibrium boundaries (non-reflective inflow/outflow)
-  - temperature boundaries (fixed temperature)
-- global force per volume (Guo forcing), can be modified on-the-fly
-- local force per volume (force field)
-  - optional computation of forces from the fluid on solid boundaries
-- state-of-the-art [free surface LBM](https://doi.org/10.3390/computation10060092) (FSLBM) implementation:
-  - [volume-of-fluid model](https://doi.org/10.15495/EPub_UBT_00005400)
-  - [fully analytic PLIC](https://doi.org/10.3390/computation10020021) for efficient curvature calculation
-  - improved mass conservation
-  - ultra efficient implementation with only [4 kernels](https://doi.org/10.3390/computation10060092) additionally to `stream_collide()` kernel
-- thermal LBM to simulate thermal convection
-  - D3Q7 subgrid for thermal DDFs
-  - in-place streaming with [Esoteric-Pull](https://doi.org/10.3390/computation10060092) for thermal DDFs
-  - optional [FP16S or FP16C compression](https://www.researchgate.net/publication/362275548_Accuracy_and_performance_of_the_lattice_Boltzmann_method_with_64-bit_32-bit_and_customized_16-bit_number_formats) for thermal DDFs with [DDF-shifting](https://www.researchgate.net/publication/362275548_Accuracy_and_performance_of_the_lattice_Boltzmann_method_with_64-bit_32-bit_and_customized_16-bit_number_formats)
-- Smagorinsky-Lilly subgrid turbulence LES model to keep simulations with very large Reynolds number stable
-  <details><summary>&#9900; &nbsp;equations</summary><p align="center"><i>&Pi;<sub>&alpha;&beta;</sub></i> = &Sigma;<sub><i>i</i></sub> <i>e<sub>i&alpha;</sub></i> <i>e<sub>i&beta;</sub></i> (<i>f<sub>i</sub></i> - <i>f<sub>i</sub></i><sup>eq-shifted</sup>)<br><br>Q = &Sigma;<sub><i>&alpha;&beta;</i></sub> <i>&Pi;<sub>&alpha;&beta;</sub></i><sup>2</sup><br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;______________________<br>&tau; = &frac12; (&tau;<sub>0</sub> + &radic; &tau;<sub>0</sub><sup>2</sup> + <sup>(16&radic;2)</sup>&#8725;<sub>(<i>3&pi;</i><sup>2</sup>)</sub> <sup>&radic;Q</sup>&#8725;<sub><i>&rho;</i></sub> )</p></details>
-- particles with immersed-boundary method (either passive or 2-way-coupled, only supported with single-GPU)
+  </details>
 
 
 
-## Graphics Features
+## Solving the Visualization Problem
 
-- on Windows and Linux: real time [interactive rasterization and raytracing graphics](https://www.researchgate.net/publication/360501260_Combined_scientific_CFD_simulation_and_interactive_raytracing_with_OpenCL)
-- on Windows and Linux (even in WSL and/or remote through SSH): real time interactive ASCII console graphics
-- rendering is fully parallelized for multi-GPU via seamless domain decomposition rasterization
-- with interactive graphics mode disabled, image resolution can be as large as VRAM allows for (132 Megapixel (16K) and above)
+- FluidX3D can do simulations so large that storing the volumetric data for later rendering becomes unmanageable (like 120GB for a single frame, hundreds of terabytes for a video)
+- instead, FluidX3D allows [rendering raw simulation data directly in VRAM](https://www.researchgate.net/publication/360501260_Combined_scientific_CFD_simulation_and_interactive_raytracing_with_OpenCL), so no large volumetric files have to be exported to the hard disk (see my [technical talk](https://youtu.be/pD8JWAZ2f8o))
+- the rendering is so fast that it works interactively in real time for both rasterization and raytracing
+- if no monitor is available (like on a remote Linux server), there is an ASCII rendering mode to interactively visualize the simulation in the terminal (even in WSL and/or through SSH)
+- rendering is fully multi-GPU-parallelized via seamless domain decomposition rasterization
+- with interactive graphics mode disabled, image resolution can be as large as VRAM allows for (4K/8K/16K and above)
 - (interacitive) visualization modes:
   - flags (and force vectors on solid boundary cells if the extension is used)
   - velocity field
@@ -254,6 +256,24 @@ $$f_j(i\\%2\\ ?\\ \vec{x}+\vec{e}_i\\ :\\ \vec{x},\\ t+\Delta t)=f_i^\textrm{tem
 
 
 
+## Solving the Compatibility Problem
+
+- FluidX3D is written in OpenCL 1.2, so it runs on any hardware from any vendor (Nvidia, AMD, Intel, ...):
+  - world's fastest datacenter GPUs, like H100, A100, MI250(X), MI210, MI100, V100(S), P100, ...
+  - gaming GPUs (desktop or laptop), like Nvidia GeForce, AMD Radeon, Intel Arc
+  - professional/workstation GPUs, like Nvidia Quadro, AMD Radeon Pro / FirePro
+  - integrated GPUs
+  - Intel Xeon Phi (requires installation of the [Intel OpenCL CPU Runtime ("oclcpuexp")](https://github.com/intel/llvm/releases?q=oneAPI+DPC%2B%2B+Compiler))
+  - Intel/AMD CPUs (requires installation of the [Intel OpenCL CPU Runtime ("oclcpuexp")](https://github.com/intel/llvm/releases?q=oneAPI+DPC%2B%2B+Compiler))
+  - even smartphone ARM GPUs
+- supports parallelization across multiple GPUs on a single PC/laptop/server with PCIe communication, no SLI/Crossfire/NVLink/InfinityFabric or MPI installation required; the GPUs don't even have to be from the same vendor, but similar memory capacity and bandwidth are recommended
+- works on Windows and Linux with C++17, and also has limited support for macOS and Android
+- supports importing and voxelizing triangle meshes from binary `.stl` files, with fast GPU voxelization
+- supports exporting volumetric data as binary `.vtk` files with `lbm.<field>.write_device_to_vtk();`
+- supports exporting rendered frames as `.png`/`.qoi`/`.bmp` files with `lbm.graphics.write_frame();`; encoding is handled in parallel on the CPU while the simulation on GPU can continue without delay
+
+
+
 ## How to get started?
 
 1. Check the settings and extensions in [`src/defines.hpp`](src/defines.hpp) by uncommenting corresponding lines.
@@ -289,24 +309,6 @@ $$f_j(i\\%2\\ ?\\ \vec{x}+\vec{e}_i\\ :\\ \vec{x},\\ t+\Delta t)=f_i^\textrm{tem
 
 
 
-## Compatibility
-
-- works in Windows, Linux and Android with C++17
-- runs on any hardware that supports OpenCL 1.2, from any vendor (Nvidia, AMD, Intel, ...):
-  - world's fastest datacenter GPUs like H100, A100, MI250(X), MI210, MI100, V100(S), P100, ...
-  - gaming GPUs (desktop or laptop)
-  - "professional"/workstation GPUs
-  - integrated GPUs
-  - Xeon Phi
-  - CPUs
-  - even smartphone ARM GPUs
-- supports parallelization across multiple GPUs on a single node (PC/laptop/server) with PCIe communication, no SLI/Crossfire/NVLink/InfinityFabric or MPI installation required; the GPUs don't even have to be from the same vendor, but similar memory capacity and bandwidth is recommended
-- supports importing and voxelizing triangle meshes from binary `.stl` files, with fast GPU voxelization
-- supports exporting volumetric data as binary `.vtk` files
-- supports exporting rendered frames as `.png`/`.qoi`/`.bmp` files; time-consuming image encoding is handled in parallel on the CPU while the simulation on GPU can continue without delay
-
-
-
 ## Single-GPU Benchmarks
 
 Here are [performance benchmarks](https://doi.org/10.3390/computation10060092) on various hardware in MLUPs/s, or how many million lattice points are updated per second. The settings used for the benchmark are D3Q19 SRT with no extensions enabled (only LBM with implicit mid-grid bounce-back boundaries) and the setup consists of an empty cubic box with sufficient size (typically 256³). Without extensions, a single lattice point requires:
@@ -329,7 +331,8 @@ Colors: 🔴 AMD, 🔵 Intel, 🟢 Nvidia, 🟣 Apple, 🟡 Samsung
 | 🟢&nbsp;A100&nbsp;SXM4&nbsp;80GB                |              19.49 |          80 |         2039 |       10228&nbsp;(77%) |             18448 (70%) |             11197 (42%) |
 | 🟢&nbsp;A100&nbsp;SXM4&nbsp;40GB                |              19.49 |          40 |         1555 |             8522 (84%) |             16013 (79%) |             11251 (56%) |
 | 🟢&nbsp;A100&nbsp;PCIe&nbsp;40GB                |              19.49 |          40 |         1555 |             8526 (84%) |             16035 (79%) |             11088 (55%) |
-| 🟢&nbsp;Tesla&nbsp;V100&nbsp;16GB               |              14.13 |          16 |          900 |             5128 (87%) |             10325 (88%) |              7683 (66%) |
+| 🟢&nbsp;Tesla&nbsp;V100&nbsp;SXM2&nbsp;32GB     |              15.67 |          32 |          900 |             4471 (76%) |              8947 (77%) |              7217 (62%) |
+| 🟢&nbsp;Tesla&nbsp;V100&nbsp;PCIe&nbsp;16GB     |              14.13 |          16 |          900 |             5128 (87%) |             10325 (88%) |              7683 (66%) |
 | 🟢&nbsp;Quadro&nbsp;GV100                       |              16.66 |          32 |          870 |             3442 (61%) |              6641 (59%) |              5863 (52%) |
 | 🟢&nbsp;Titan&nbsp;V                            |              14.90 |          12 |          653 |             3601 (84%) |              7253 (86%) |              6957 (82%) |
 | 🟢&nbsp;Tesla&nbsp;P100&nbsp;16GB               |               9.52 |          16 |          732 |             3295 (69%) |              5950 (63%) |              4176 (44%) |
@@ -434,10 +437,19 @@ Colors: 🔴 AMD, 🔵 Intel, 🟢 Nvidia, 🟣 Apple, 🟡 Samsung
 | 🔴&nbsp;4x&nbsp;Radeon&nbsp;VII                                 |              55.32 |          64 |         4096 |      12911&nbsp;(2.6x) |            24273 (3.1x) |            17080 (3.2x) |
 | 🔴&nbsp;8x&nbsp;Radeon&nbsp;VII                                 |             110.64 |         128 |         8192 |      21946&nbsp;(4.5x) |            30826 (4.0x) |            24572 (4.7x) |
 |                                                                 |                    |             |              |                        |                         |                         |
+| 🟢&nbsp;1x&nbsp;A100&nbsp;SXM4&nbsp;40GB                        |              19.49 |          40 |         1555 |             8543 (84%) |             15917 (79%) |              8748 (43%) |
+| 🟢&nbsp;2x&nbsp;A100&nbsp;SXM4&nbsp;40GB                        |              38.98 |          80 |         3110 |      14311&nbsp;(1.7x) |            23707 (1.5x) |            15512 (1.8x) |
+| 🟢&nbsp;4x&nbsp;A100&nbsp;SXM4&nbsp;40GB                        |              77.96 |         160 |         6220 |      23411&nbsp;(2.7x) |            42400 (2.7x) |            29017 (3.3x) |
+| 🟢&nbsp;8x&nbsp;A100&nbsp;SXM4&nbsp;40GB                        |             155.92 |         320 |        12440 |      37619&nbsp;(4.4x) |            72965 (4.6x) |            63009 (7.2x) |
+|                                                                 |                    |             |              |                        |                         |                         |
 | 🟢&nbsp;1x&nbsp;A100&nbsp;SXM4&nbsp;40GB                        |              19.49 |          40 |         1555 |             8522 (84%) |             16013 (79%) |             11251 (56%) |
 | 🟢&nbsp;2x&nbsp;A100&nbsp;SXM4&nbsp;40GB                        |              38.98 |          80 |         3110 |      13629&nbsp;(1.6x) |            24620 (1.5x) |            18850 (1.7x) |
 | 🟢&nbsp;4x&nbsp;A100&nbsp;SXM4&nbsp;40GB                        |              77.96 |         160 |         6220 |      17978&nbsp;(2.1x) |            30604 (1.9x) |            30627 (2.7x) |
 |                                                                 |                    |             |              |                        |                         |                         |
+| 🟢&nbsp;1x&nbsp;Tesla&nbsp;V100&nbsp;SXM2&nbsp;32GB             |              15.67 |          32 |          900 |             4471 (76%) |              8947 (77%) |              7217 (62%) |
+| 🟢&nbsp;2x&nbsp;Tesla&nbsp;V100&nbsp;SXM2&nbsp;32GB             |              31.34 |          64 |         1800 |            7953 (1.8x) |            15469 (1.7x) |            12932 (1.8x) |
+| 🟢&nbsp;4x&nbsp;Tesla&nbsp;V100&nbsp;SXM2&nbsp;32GB             |              62.68 |         128 |         3600 |      13135&nbsp;(2.9x) |            26527 (3.0x) |            22686 (3.1x) |
+|                                                                 |                    |             |              |                        |                         |                         |
 | 🟢&nbsp;1x&nbsp;Tesla&nbsp;K40m                                 |               4.29 |          12 |          288 |             1131 (60%) |              1868 (50%) |               912 (24%) |
 | 🟢&nbsp;2x&nbsp;Tesla&nbsp;K40m                                 |               8.58 |          24 |          577 |            1971 (1.7x) |             3300 (1.8x) |             1801 (2.0x) |
 | 🟢&nbsp;3x&nbsp;K40m&nbsp;+&nbsp;1x&nbsp;Titan&nbsp;Xp          |              17.16 |          48 |         1154 |            3117 (2.8x) |             5174 (2.8x) |             3127 (3.4x) |

From cc719cee30848e8e3beb26a9488fcc70bd8e441f Mon Sep 17 00:00:00 2001
From: Moritz Lehmann <moritz.lehmann@uni-bayreuth.de>
Date: Thu, 4 May 2023 17:38:52 +0200
Subject: [PATCH 05/15] Added time_step_multiplicator for integrate_particles()
 function in PARTICLES extension, fixed bug in write_file() template functions

---
 src/kernel.cpp    |  4 ++--
 src/lbm.cpp       | 20 ++++++++++----------
 src/lbm.hpp       |  6 +++---
 src/utilities.hpp |  4 ++--
 4 files changed, 17 insertions(+), 17 deletions(-)

diff --git a/src/kernel.cpp b/src/kernel.cpp
index 5e4e0cb4..db888fb9 100644
--- a/src/kernel.cpp
+++ b/src/kernel.cpp
@@ -1922,7 +1922,7 @@ string opencl_c_container() { return R( // ########################## begin of O
 )+"#endif"+R( // FORCE_FIELD
 
 )+"#ifdef PARTICLES"+R(
-)+R(kernel void integrate_particles)+"("+R(global float* particles, const global float* u // ) {
+)+R(kernel void integrate_particles)+"("+R(global float* particles, const global float* u, const float time_step_multiplicator // ) {
 )+"#ifdef FORCE_FIELD"+R(
 	, volatile global float* F, const float fx, const float fy, const float fz
 )+"#endif"+R( // FORCE_FIELD
@@ -1937,7 +1937,7 @@ string opencl_c_container() { return R( // ########################## begin of O
 		spread_force(F, p0, Fn); // do force spreading
 	}
 )+"#endif"+R( // FORCE_FIELD
-	const float3 un = interpolate_u(mirror_position(p0), u); // trilinear interpolation of velocity at point p
+	const float3 un = interpolate_u(mirror_position(p0), u)*time_step_multiplicator; // trilinear interpolation of velocity at point p
 	const float3 p = mirror_position(p0+un); // advect particles
 	particles[                           n] = p.x;
 	particles[    def_particles_N+(ulong)n] = p.y;
diff --git a/src/lbm.cpp b/src/lbm.cpp
index 9040818c..ee18ed0a 100644
--- a/src/lbm.cpp
+++ b/src/lbm.cpp
@@ -102,7 +102,7 @@ void LBM_Domain::allocate(Device& device) {
 
 #ifdef PARTICLES
 	particles = Memory<float>(device, (ulong)particles_N, 3u);
-	kernel_integrate_particles = Kernel(device, (ulong)particles_N, "integrate_particles", particles, u);
+	kernel_integrate_particles = Kernel(device, (ulong)particles_N, "integrate_particles", particles, u, 1.0f);
 #ifdef FORCE_FIELD
 	kernel_integrate_particles.add_parameters(F, fx, fy, fz);
 #endif // FORCE_FIELD
@@ -150,17 +150,17 @@ void LBM_Domain::enqueue_update_moving_boundaries() { // mark/unmark nodes next
 }
 #endif // MOVING_BOUNDARIES
 #ifdef PARTICLES
-void LBM_Domain::enqueue_integrate_particles() { // intgegrate particles forward in time and couple particles to fluid
+void LBM_Domain::enqueue_integrate_particles(const uint time_step_multiplicator) { // intgegrate particles forward in time and couple particles to fluid
 #ifdef FORCE_FIELD
 	if(particles_rho!=1.0f) kernel_reset_force_field.enqueue_run(); // only reset force field if particles have buoyancy and apply forces on fluid
-	kernel_integrate_particles.set_parameters(3u, fx, fy, fz);
+	kernel_integrate_particles.set_parameters(4u, fx, fy, fz);
 #endif // FORCE_FIELD
-	kernel_integrate_particles.enqueue_run();
+	kernel_integrate_particles.set_parameters(2u, (float)time_step_multiplicator).enqueue_run();
 }
 #endif // PARTICLES
 
-void LBM_Domain::increment_time_step() {
-	t++; // increment time step
+void LBM_Domain::increment_time_step(const uint steps) {
+	t += (ulong)steps; // increment time step
 #ifdef UPDATE_FIELDS
 	t_last_update_fields = t;
 #endif // UPDATE_FIELDS
@@ -830,18 +830,18 @@ void LBM::update_moving_boundaries() { // mark/unmark nodes next to TYPE_S nodes
 #endif // MOVING_BOUNDARIES
 
 #if defined(PARTICLES)&&!defined(FORCE_FIELD)
-void LBM::integrate_particles(const ulong steps) { // intgegrate passive tracer particles forward in time in stationary flow field
+void LBM::integrate_particles(const ulong steps, const uint time_step_multiplicator) { // intgegrate passive tracer particles forward in time in stationary flow field
 	info.append(steps, get_t());
 	Clock clock;
-	for(ulong i=1ull; i<=steps; i++) {
+	for(ulong i=1ull; i<=steps; i+=(ulong)time_step_multiplicator) {
 #if defined(INTERACTIVE_GRAPHICS)||defined(INTERACTIVE_GRAPHICS_ASCII)
 		while(!key_P&&running) sleep(0.016);
 		if(!running) break;
 #endif // INTERACTIVE_GRAPHICS_ASCII || INTERACTIVE_GRAPHICS
 		clock.start();
-		for(uint d=0u; d<get_D(); d++) lbm[d]->enqueue_integrate_particles();
+		for(uint d=0u; d<get_D(); d++) lbm[d]->enqueue_integrate_particles(time_step_multiplicator);
 		for(uint d=0u; d<get_D(); d++) lbm[d]->finish_queue();
-		for(uint d=0u; d<get_D(); d++) lbm[d]->increment_time_step();
+		for(uint d=0u; d<get_D(); d++) lbm[d]->increment_time_step(time_step_multiplicator);
 		info.update(clock.stop());
 	}
 }
diff --git a/src/lbm.hpp b/src/lbm.hpp
index d6c2ab01..7d2f6590 100644
--- a/src/lbm.hpp
+++ b/src/lbm.hpp
@@ -99,10 +99,10 @@ class LBM_Domain {
 	void enqueue_update_moving_boundaries(); // mark/unmark nodes next to TYPE_S nodes with velocity!=0 with TYPE_MS
 #endif // MOVING_BOUNDARIES
 #ifdef PARTICLES
-	void enqueue_integrate_particles(); // intgegrates particles forward in time and couples particles to fluid
+	void enqueue_integrate_particles(const uint time_step_multiplicator=1u); // intgegrates particles forward in time and couples particles to fluid
 #endif // PARTICLES
 
-	void increment_time_step(); // increment time step
+	void increment_time_step(const uint steps=1u); // increment time step
 	void reset_time_step(); // reset time step
 	void finish_queue();
 
@@ -387,7 +387,7 @@ class LBM {
 	void update_moving_boundaries(); // mark/unmark nodes next to TYPE_S nodes with velocity!=0 with TYPE_MS
 #endif // MOVING_BOUNDARIES
 #if defined(PARTICLES)&&!defined(FORCE_FIELD)
-	void integrate_particles(const ulong steps=max_ulong); // intgegrate passive tracer particles forward in time in stationary flow field
+	void integrate_particles(const ulong steps=max_ulong, const uint time_step_multiplicator=1u); // intgegrate passive tracer particles forward in time in stationary flow field
 #endif // PARTICLES&&!FORCE_FIELD
 
 	uint get_Nx() const { return Nx; } // get (global) lattice dimensions in x-direction
diff --git a/src/utilities.hpp b/src/utilities.hpp
index f8ca8ec1..20b8b829 100644
--- a/src/utilities.hpp
+++ b/src/utilities.hpp
@@ -3927,13 +3927,13 @@ inline void write_line(const string& filename, const string& content="") {
 	file.write(s.c_str(), s.length());
 	file.close();
 }
-template<typename T> inline void write_file(const string& filename, const uint n, const T* y, const string& header="") {
+template<typename T> inline void write_file(const string& filename, const string& header, const uint n, const T* y) {
 	string s = header;
 	if(length(s)>0u && !ends_with(s, "\n")) s += "\n";
 	for(uint i=0u; i<n; i++) s += to_string(i)+"\t"+to_string(y[i])+"\n";
 	write_file(filename, s);
 }
-template<typename T, typename U> inline void write_file(const string& filename, const uint n, const T* x, const U* y, const string& header="") {
+template<typename T, typename U> inline void write_file(const string& filename, const string& header, const uint n, const T* x, const U* y) {
 	string s = header;
 	if(length(s)>0u && !ends_with(s, "\n")) s += "\n";
 	for(uint i=0u; i<n; i++) s += to_string(x[i])+"\t"+to_string(y[i])+"\n";

From 08649b4b491947393954ff0f37282697c621b36a Mon Sep 17 00:00:00 2001
From: Moritz Lehmann <moritz.lehmann@uni-bayreuth.de>
Date: Thu, 4 May 2023 18:17:38 +0200
Subject: [PATCH 06/15] Reverted back to separate cl::Context for each OpenCL
 device, as the shared Context otherwise would allocate extra VRAM on all
 other unused Nvidia GPUs

---
 src/opencl.hpp | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/opencl.hpp b/src/opencl.hpp
index 23d572d3..1008e525 100644
--- a/src/opencl.hpp
+++ b/src/opencl.hpp
@@ -98,8 +98,9 @@ inline vector<Device_Info> get_devices(const bool print_info=true) { // returns
 	for(uint i=0u; i<(uint)cl_platforms.size(); i++) {
 		vector<cl::Device> cl_devices;
 		cl_platforms[i].getDevices(CL_DEVICE_TYPE_ALL, &cl_devices);
-		cl::Context cl_context(cl_devices);
+		//cl::Context cl_context(cl_devices); // same cl::Context for all devices (allocates extra VRAM on all other unused Nvidia GPUs)
 		for(uint j=0u; j<(uint)cl_devices.size(); j++) {
+			cl::Context cl_context(cl_devices[j]); // separate cl::Context for each device
 			devices.push_back(Device_Info(cl_devices[j], cl_context, id++));
 		}
 	}

From e6e561f1ec6fd46427cbdd367dbd0195473d3f35 Mon Sep 17 00:00:00 2001
From: Moritz Lehmann <moritz.lehmann@uni-bayreuth.de>
Date: Sun, 14 May 2023 08:36:45 +0200
Subject: [PATCH 07/15] Small name change ;)

---
 LICENSE.md | 2 +-
 README.md  | 6 ++++--
 2 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/LICENSE.md b/LICENSE.md
index 134ec80c..df017491 100644
--- a/LICENSE.md
+++ b/LICENSE.md
@@ -1,4 +1,4 @@
-Copyright (c) 2022-2023 Moritz Lehmann
+Copyright (c) 2022-2023 Dr. Moritz Lehmann
 
 Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files, to use this software for public research, education or personal use, and to alter it and redistribute it freely, subject to the following restrictions:
 
diff --git a/README.md b/README.md
index b16526a7..6f0f0d90 100644
--- a/README.md
+++ b/README.md
@@ -2,7 +2,7 @@
 
 The fastest and most memory efficient lattice Boltzmann CFD software, running on all GPUs via [OpenCL](https://github.com/ProjectPhysX/OpenCL-Wrapper "OpenCL-Wrapper").
 
-<a href="https://youtu.be/o3TPN142HxM"><img src="https://img.youtube.com/vi/o3TPN142HxM/maxresdefault.jpg" width="50%"></img></a><a href="https://youtu.be/oC6U1M0Fsug"><img src="https://img.youtube.com/vi/oC6U1M0Fsug/maxresdefault.jpg" width="50%"></img></a><br>
+<a href="https://youtu.be/-MkRBeQkLk8"><img src="https://img.youtube.com/vi/o3TPN142HxM/maxresdefault.jpg" width="50%"></img></a><a href="https://youtu.be/oC6U1M0Fsug"><img src="https://img.youtube.com/vi/oC6U1M0Fsug/maxresdefault.jpg" width="50%"></img></a><br>
 <a href="https://youtu.be/XOfXHgP4jnQ"><img src="https://img.youtube.com/vi/XOfXHgP4jnQ/maxresdefault.jpg" width="50%"></img></a><a href="https://youtu.be/3JNVBQyetMA"><img src="https://img.youtube.com/vi/3JNVBQyetMA/maxresdefault.jpg" width="50%"></img></a>
 
 
@@ -539,6 +539,7 @@ Colors: 🔴 AMD, 🔵 Intel, 🟢 Nvidia, 🟣 Apple, 🟡 Samsung
 
 ## References
 
+- Lehmann, M.: [Computational study of microplastic transport at the water-air interface with a memory-optimized lattice Boltzmann method](https://doi.org/10.15495/EPub_UBT_00006977). PhD thesis, (2023)
 - Lehmann, M.: [Esoteric Pull and Esoteric Push: Two Simple In-Place Streaming Schemes for the Lattice Boltzmann Method on GPUs](https://doi.org/10.3390/computation10060092). Computation, 10, 92, (2022)
 - Lehmann, M., Krause, M., Amati, G., Sega, M., Harting, J. and Gekle, S.: [Accuracy and performance of the lattice Boltzmann method with 64-bit, 32-bit, and customized 16-bit number formats](https://www.researchgate.net/publication/362275548_Accuracy_and_performance_of_the_lattice_Boltzmann_method_with_64-bit_32-bit_and_customized_16-bit_number_formats). Phys. Rev. E 106, 015308, (2022)
 - Lehmann, M.: [Combined scientific CFD simulation and interactive raytracing with OpenCL](https://www.researchgate.net/publication/360501260_Combined_scientific_CFD_simulation_and_interactive_raytracing_with_OpenCL). IWOCL'22: International Workshop on OpenCL, 3, 1-2, (2022)
@@ -550,5 +551,6 @@ Colors: 🔴 AMD, 🔵 Intel, 🟢 Nvidia, 🟣 Apple, 🟡 Samsung
 
 ## Contact
 
-- For any questions, feedback or other inquiries, don't hesitate to contact me at [moritz.lehmann@uni-bayreuth.de](mailto:moritz.lehmann@uni-bayreuth.de?subject=FluidX3D).
+- FluidX3D is solo-developed and maintained by Dr. Moritz Lehmann.
+- For any questions, feedback or other inquiries, contact me at [moritz.lehmann@uni-bayreuth.de](mailto:moritz.lehmann@uni-bayreuth.de?subject=FluidX3D).
 - Updates will be posted on Twitter via [@FluidX3D](https://twitter.com/FluidX3D) and [@ProjectPhysX](https://twitter.com/ProjectPhysX), under the hashtag [#FluidX3D](https://twitter.com/hashtag/FluidX3D?src=hashtag_click&f=live) or on my [YouTube channel](https://www.youtube.com/c/ProjectPhysX).
\ No newline at end of file

From 6292a90efe9be634cfc1cc12db633d5aaf56c15a Mon Sep 17 00:00:00 2001
From: Moritz Lehmann <moritz.lehmann@uni-bayreuth.de>
Date: Wed, 17 May 2023 11:06:05 +0200
Subject: [PATCH 08/15] Added more benchmarks in Readme

---
 README.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/README.md b/README.md
index 6f0f0d90..b0a61e5d 100644
--- a/README.md
+++ b/README.md
@@ -345,6 +345,7 @@ Colors: 🔴 AMD, 🔵 Intel, 🟢 Nvidia, 🟣 Apple, 🟡 Samsung
 | 🔴&nbsp;Radeon&nbsp;RX&nbsp;7900&nbsp;XT        |              51.61 |          20 |          800 |             3013 (58%) |              5856 (56%) |              5986 (58%) |
 | 🔴&nbsp;Radeon&nbsp;RX&nbsp;6900&nbsp;XT        |              23.04 |          16 |          512 |             1968 (59%) |              4227 (64%) |              4207 (63%) |
 | 🔴&nbsp;Radeon&nbsp;RX&nbsp;6800&nbsp;XT        |              20.74 |          16 |          512 |             2008 (60%) |              4241 (64%) |              4224 (64%) |
+| 🔴&nbsp;Radeon&nbsp;RX&nbsp;6700M               |              10.60 |          10 |          320 |             1194 (57%) |              2388 (57%) |              2429 (58%) |
 | 🔴&nbsp;Radeon&nbsp;RX&nbsp;5700&nbsp;XT        |               9.75 |           8 |          448 |             1368 (47%) |              3253 (56%) |              3049 (52%) |
 | 🔴&nbsp;Radeon&nbsp;RX&nbsp;Vega&nbsp;64        |              13.35 |           8 |          484 |             1875 (59%) |              2878 (46%) |              3227 (51%) |
 | 🔴&nbsp;Radeon&nbsp;RX&nbsp;580&nbsp;4GB        |               6.50 |           4 |          256 |              946 (57%) |              1848 (56%) |              1577 (47%) |

From 593619e472135b12423624c01972486f718f8df5 Mon Sep 17 00:00:00 2001
From: Moritz Lehmann <moritz.lehmann@uni-bayreuth.de>
Date: Thu, 18 May 2023 07:52:42 +0200
Subject: [PATCH 09/15] Added more benchmarks in Readme

---
 README.md | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/README.md b/README.md
index b0a61e5d..3f966bc1 100644
--- a/README.md
+++ b/README.md
@@ -283,7 +283,7 @@ $$f_j(i\\%2\\ ?\\ \vec{x}+\vec{e}_i\\ :\\ \vec{x},\\ t+\Delta t)=f_i^\textrm{tem
    - Set the initial condition in a loop that iterates over the entire lattice by writing to `lbm.rho[n]`/`lbm.u.x[n]`/`lbm.u.y[n]`/`lbm.u.z[n]`/`lbm.flags[n]`.
    - Call `lbm.run();` to initialize and execute the setup (infinite time steps) or `lbm.run(time_steps);` to execute only a specific number of time steps.
    - As long as the `lbm` object is in scope, you can access the memory. As soon as it goes out of scope, all memory associated to the current simulation is freed again.
-3. When done with the setup, on Windows in Visual Studio Community select "Release" and "x64" and hit compile+run, or on Linux execute `chmod +x make.sh` and `./make.sh`; this will automatically select the fastest installed GPU(s). Alternatively, you can add the device ID(s) as command-line arguments, for example `./make.sh 2` to compile+run on device 2, or `bin/FluidX3D 1 3` to run the executable on devices 1 and 3. Compile time for the entire code is about 10 seconds. If you use `INTERACTIVE_GRAPHICS` on Linux, change to the "compile on Linux with X11" command in `make.sh`.
+3. On Windows in Visual Studio Community select `Release` and `x64` and click compile+run, or on Linux run `chmod +x make.sh` and `./make.sh`; this will automatically select the fastest installed GPU(s). Alternatively, you can add the device ID(s) as command-line arguments, for example `./make.sh 2` to compile+run on device 2, or `bin/FluidX3D 1 3` to run the executable on devices 1 and 3. Compile time for the entire code is about 10 seconds. If you use `INTERACTIVE_GRAPHICS` on Linux, change to the "compile on Linux with X11" command in `make.sh`.
 4. Keyboard/mouse controls with `INTERACTIVE_GRAPHICS`/`INTERACTIVE_GRAPHICS_ASCII` enabled:
    - <kbd>P</kbd>: start/pause the simulation
    - <kbd>H</kbd>: show/hide help
@@ -349,6 +349,7 @@ Colors: 🔴 AMD, 🔵 Intel, 🟢 Nvidia, 🟣 Apple, 🟡 Samsung
 | 🔴&nbsp;Radeon&nbsp;RX&nbsp;5700&nbsp;XT        |               9.75 |           8 |          448 |             1368 (47%) |              3253 (56%) |              3049 (52%) |
 | 🔴&nbsp;Radeon&nbsp;RX&nbsp;Vega&nbsp;64        |              13.35 |           8 |          484 |             1875 (59%) |              2878 (46%) |              3227 (51%) |
 | 🔴&nbsp;Radeon&nbsp;RX&nbsp;580&nbsp;4GB        |               6.50 |           4 |          256 |              946 (57%) |              1848 (56%) |              1577 (47%) |
+| 🔴&nbsp;Radeon&nbsp;R9&nbsp;390X                |               5.91 |           8 |          384 |             1733 (69%) |              2217 (44%) |              1722 (35%) |
 | 🔴&nbsp;Radeon&nbsp;HD&nbsp;7850                |               1.84 |           2 |          154 |              112 (11%) |               120 ( 6%) |               635 (32%) |
 | 🔵&nbsp;Arc&nbsp;A770&nbsp;LE                   |              19.66 |          16 |          560 |             2741 (75%) |              4591 (63%) |              4626 (64%) |
 | 🔵&nbsp;Arc&nbsp;A750&nbsp;LE                   |              17.20 |           8 |          512 |             2625 (78%) |              4184 (63%) |              4238 (64%) |

From bddff57a145ad8cdf63db47028f7f1f16d7fddc8 Mon Sep 17 00:00:00 2001
From: Moritz Lehmann <moritz.lehmann@uni-bayreuth.de>
Date: Fri, 19 May 2023 19:38:00 +0200
Subject: [PATCH 10/15] Minor cosmetics

---
 src/graphics.hpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/graphics.hpp b/src/graphics.hpp
index f57cef69..77c6529b 100644
--- a/src/graphics.hpp
+++ b/src/graphics.hpp
@@ -1,8 +1,8 @@
 #pragma once
 
 #define WINDOW_NAME "FluidX3D"
-//#define CONSOLE_GRAPHICS
-//#define WINDOWS_GRAPHICS
+//#define INTERACTIVE_GRAPHICS
+//#define INTERACTIVE_GRAPHICS_ASCII
 //#define GRAPHICS
 
 #include "defines.hpp"

From ea520f6d1670fefcf6dc573832db16227104adf0 Mon Sep 17 00:00:00 2001
From: Moritz Lehmann <moritz.lehmann@uni-bayreuth.de>
Date: Sat, 20 May 2023 07:08:38 +0200
Subject: [PATCH 11/15] Minor cosmetics: fixed inconsistent uint/int datatype
 for colors

---
 src/kernel.cpp | 26 +++++++++++++-------------
 src/setup.cpp  |  2 +-
 2 files changed, 14 insertions(+), 14 deletions(-)

diff --git a/src/kernel.cpp b/src/kernel.cpp
index db888fb9..03c65f68 100644
--- a/src/kernel.cpp
+++ b/src/kernel.cpp
@@ -599,7 +599,7 @@ string opencl_c_container() { return R( // ########################## begin of O
 	camray.direction = p1-p0;
 	return camray;
 }
-)+R(uint skybox_bottom(const ray r, const int c1, const int c2, const uint skybox_color) {
+)+R(int skybox_bottom(const ray r, const int c1, const int c2, const int skybox_color) {
 	const float3 p0=(float3)(0.0f, 0.0f, -0.5f*(float)def_Nz), p1=(float3)(1.0f, 0.0f, -0.5f*(float)def_Nz), p2=(float3)(0.0f, 1.0f, -0.5f*(float)def_Nz);
 	const float distance = intersect_plane(r, p0, p1, p2);
 	if(distance>0.0f) { // ray intersects with bottom
@@ -615,25 +615,25 @@ string opencl_c_container() { return R( // ########################## begin of O
 		return skybox_color;
 	}
 }
-)+R(uint skybox_color_bw(const float x, const float y) {
+)+R(int skybox_color_bw(const float x, const float y) {
 	return color_dim(0xFFFFFF, 1.0f-y);
 }
-)+R(uint skybox_color_hsv(const float x, const float y) {
+)+R(int skybox_color_hsv(const float x, const float y) {
 	const float h = fmod(x*360.0f+120.0f, 360.0f);
 	const float s = y>0.5f ? 1.0f : 2.0f*y;
 	const float v = y>0.5f ? 2.0f-2.0f*y : 1.0f;
 	return hsv_to_rgb(h, s, v);
 }
-)+R(uint skybox_color_sunset(const float x, const float y) {
+)+R(int skybox_color_sunset(const float x, const float y) {
 	return color_mix(255<<16|175<<8|55, y<0.5f ? 55<<16|111<<8|255 : 0, 2.0f*(0.5f-fabs(y-0.5f)));
 }
-)+R(uint skybox_color_grid(const float x, const float y, const int c1, const int c2) {
+)+R(int skybox_color_grid(const float x, const float y, const int c1, const int c2) {
 	int a = (int)(72.0f*x);
 	int b = (int)(36.0f*y);
 	const int w = (a%2==b%2);
 	return w*c1+(1-w)*c2;
 }
-)+R(uint skybox_color(const ray r, const global int* skybox) {
+)+R(int skybox_color(const ray r, const global int* skybox) {
 	const float3 direction = normalize(r.direction); // to avoid artifacts from asin(direction.z)
 	//const float x = fma(atan2(direction.x, direction.y),  0.5f/3.1415927f, 0.5f);
 	//const float y = fma(asin (direction.z             ), -1.0f/3.1415927f, 0.5f);
@@ -643,11 +643,11 @@ string opencl_c_container() { return R( // ########################## begin of O
 	const float fu = (float)def_skybox_width *fma(atan2(direction.x, direction.y),  0.5f/3.1415927f, 0.5f);
 	const float fv = (float)def_skybox_height*fma(asin (direction.z             ), -1.0f/3.1415927f, 0.5f);
 	const int ua=clamp((int)fu, 0, (int)def_skybox_width-1), va=clamp((int)fv, 0, (int)def_skybox_height-1), ub=(ua+1)%def_skybox_width, vb=min(va+1, (int)def_skybox_height-1); // bilinear interpolation positions
-	const uint s00=skybox[ua+va*def_skybox_width], s01=skybox[ua+vb*def_skybox_width], s10=skybox[ub+va*def_skybox_width], s11=skybox[ub+vb*def_skybox_width];
+	const int s00=skybox[ua+va*def_skybox_width], s01=skybox[ua+vb*def_skybox_width], s10=skybox[ub+va*def_skybox_width], s11=skybox[ub+vb*def_skybox_width];
 	const float u1=fu-(float)ua, v1=fv-(float)va, u0=1.0f-u1, v0=1.0f-v1; // interpolation factors
 	return color_mix(color_mix(s00, s01, v0), color_mix(s10, s11, v0), u0); // perform bilinear interpolation
 }
-)+R(uint last_ray_reflectivity(const ray reflection, const ray transmission, const float reflectivity, const float transmissivity, const global int* skybox) {
+)+R(int last_ray(const ray reflection, const ray transmission, const float reflectivity, const float transmissivity, const global int* skybox) {
 	return color_mix(skybox_color(reflection, skybox), color_mix(skybox_color(transmission, skybox), def_absorption_color, transmissivity), reflectivity);
 }
 )+R(float ray_grid_traverse(const ray r, const global float* phi, const global uchar* flags, float3* normal, const uint Nx, const uint Ny, const uint Nz) {
@@ -1217,7 +1217,7 @@ string opencl_c_container() { return R( // ########################## begin of O
 	};
 	return c[i];
 }
-)+R(float curvature_calculation(const uint n, const float* phit, const global float* phi) { // calculate surface curvature, always use D3Q27 stencil here, source: https://doi.org/10.3390/computation10020021
+)+R(float calculate_curvature(const uint n, const float* phit, const global float* phi) { // calculate surface curvature, always use D3Q27 stencil here, source: https://doi.org/10.3390/computation10020021
 )+"#ifndef D2Q9"+R(
 	float phij[27];
 	get_remaining_neighbor_phij(n, phit, phi, phij); // complete neighborhood from whatever velocity set is selected to D3Q27
@@ -1685,7 +1685,7 @@ string opencl_c_container() { return R( // ########################## begin of O
 		uyn = clamp(uyn, -def_c, def_c);
 		uzn = clamp(uzn, -def_c, def_c);
 		phij[0] = calculate_phi(rhon, massn, flagsn); // don't load phi[n] from memory, instead recalculate it with mass corrected by excess mass
-		rho_laplace = def_6_sigma==0.0f ? 0.0f : def_6_sigma*curvature_calculation(n, phij, phi); // surface tension least squares fit (PLIC, most accurate)
+		rho_laplace = def_6_sigma==0.0f ? 0.0f : def_6_sigma*calculate_curvature(n, phij, phi); // surface tension least squares fit (PLIC, most accurate)
 		float feg[def_velocity_set]; // reconstruct f from neighbor gas lattice points
 		const float rho2tmp = 0.5f/rhon; // apply external volume force (Guo forcing, Krueger p.233f)
 		const float uxntmp = clamp(fma(fx, rho2tmp, uxn), -def_c, def_c); // limit velocity (for stability purposes)
@@ -2610,12 +2610,12 @@ string opencl_c_container() { return R( // ########################## begin of O
 	ray reflection_next, transmission_next;
 	float reflection_reflectivity, reflection_transmissivity, transmission_reflectivity, transmission_transmissivity;
 	if(raytrace_phi(reflection, &reflection_next, &transmission_next, &reflection_reflectivity, &reflection_transmissivity, phi, flags, skybox, def_Nx, def_Ny, def_Nz)) {
-		color_reflect = last_ray_reflectivity(reflection_next, transmission_next, reflection_reflectivity, reflection_transmissivity, skybox);
+		color_reflect = last_ray(reflection_next, transmission_next, reflection_reflectivity, reflection_transmissivity, skybox);
 	} else {
 		color_reflect = skybox_color(reflection, skybox);
 	}
 	if(raytrace_phi(transmission, &reflection_next, &transmission_next, &transmission_reflectivity, &transmission_transmissivity, phi, flags, skybox, def_Nx, def_Ny, def_Nz)) {
-		color_transmit = last_ray_reflectivity(reflection_next, transmission_next, transmission_reflectivity, transmission_transmissivity, skybox);
+		color_transmit = last_ray(reflection_next, transmission_next, transmission_reflectivity, transmission_transmissivity, skybox);
 	} else {
 		color_transmit = skybox_color(transmission, skybox);
 	}
@@ -2650,7 +2650,7 @@ string opencl_c_container() { return R( // ########################## begin of O
 	float reflectivity, transmissivity;
 	int pixelcolor = 0;
 	if(raytrace_phi(camray, &reflection, &transmission, &reflectivity, &transmissivity, phi, flags, skybox, def_Nx, def_Ny, def_Nz)) {
-		pixelcolor = last_ray_reflectivity(reflection, transmission, reflectivity, transmissivity, skybox); // 1 ray pass
+		pixelcolor = last_ray(reflection, transmission, reflectivity, transmissivity, skybox); // 1 ray pass
 		//pixelcolor = raytrace_phi_next_ray(reflection, transmission, reflectivity, transmissivity, phi, flags, skybox); // 2 ray passes
 	} else {
 		pixelcolor = skybox_color(camray, skybox);
diff --git a/src/setup.cpp b/src/setup.cpp
index 57446cf9..67d123e3 100644
--- a/src/setup.cpp
+++ b/src/setup.cpp
@@ -686,7 +686,7 @@
 
 /*void main_setup() { // hydraulic jump
 	// ######################################################### define simulation box size, viscosity and volume force ############################################################################
-	LBM lbm(96u, 352u, 96u, 1u, 2u, 1u, 0.007f, 0.0f, 0.0f, -0.0005f);
+	LBM lbm(96u, 352u, 96u, 1u, 1u, 1u, 0.007f, 0.0f, 0.0f, -0.0005f);
 	// #############################################################################################################################################################################################
 	const ulong N=lbm.get_N(); const uint Nx=lbm.get_Nx(), Ny=lbm.get_Ny(), Nz=lbm.get_Nz(); for(ulong n=0ull; n<N; n++) { uint x=0u, y=0u, z=0u; lbm.coordinates(n, x, y, z);
 		// ########################################################################### define geometry #############################################################################################

From c84442c84bad0c67454eb4081d1ff4b86f82e742 Mon Sep 17 00:00:00 2001
From: Moritz Lehmann <moritz.lehmann@uni-bayreuth.de>
Date: Sun, 21 May 2023 10:13:47 +0200
Subject: [PATCH 12/15] Added more benchmarks in Readme

---
 README.md | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/README.md b/README.md
index 3f966bc1..16220f88 100644
--- a/README.md
+++ b/README.md
@@ -337,6 +337,7 @@ Colors: 🔴 AMD, 🔵 Intel, 🟢 Nvidia, 🟣 Apple, 🟡 Samsung
 | 🟢&nbsp;Titan&nbsp;V                            |              14.90 |          12 |          653 |             3601 (84%) |              7253 (86%) |              6957 (82%) |
 | 🟢&nbsp;Tesla&nbsp;P100&nbsp;16GB               |               9.52 |          16 |          732 |             3295 (69%) |              5950 (63%) |              4176 (44%) |
 | 🟢&nbsp;Tesla&nbsp;P100&nbsp;12GB               |               9.52 |          12 |          549 |             2427 (68%) |              4141 (58%) |              3999 (56%) |
+| 🟢&nbsp;GeForce&nbsp;GTX&nbsp;TITAN             |               4.71 |           6 |          288 |             1460 (77%) |              2500 (67%) |              1113 (30%) |
 | 🟢&nbsp;Tesla&nbsp;K40m                         |               4.29 |          12 |          288 |             1131 (60%) |              1868 (50%) |               912 (24%) |
 | 🟢&nbsp;Tesla&nbsp;K80&nbsp;(1&nbsp;GPU)        |               4.11 |          12 |          240 |              916 (58%) |              1642 (53%) |               943 (30%) |
 | 🟢&nbsp;Tesla&nbsp;K20c                         |               3.52 |           5 |          208 |              861 (63%) |              1507 (56%) |               720 (27%) |
@@ -392,6 +393,7 @@ Colors: 🔴 AMD, 🔵 Intel, 🟢 Nvidia, 🟣 Apple, 🟡 Samsung
 | 🟢&nbsp;Quadro&nbsp;M4000                       |               2.57 |           8 |          192 |              899 (72%) |              1519 (61%) |              1050 (42%) |
 | 🟢&nbsp;Tesla&nbsp;M60&nbsp;(1&nbsp;GPU)        |               4.82 |           8 |          160 |              853 (82%) |              1571 (76%) |              1557 (75%) |
 | 🟢&nbsp;GeForce&nbsp;GTX&nbsp;960M              |               1.51 |           4 |           80 |              442 (84%) |               872 (84%) |               627 (60%) |
+| 🟢&nbsp;GeForce&nbsp;GTX&nbsp;680&nbsp;4GB      |               3.33 |           4 |          192 |              783 (62%) |              1274 (51%) |               814 (33%) |
 | 🟢&nbsp;Quadro&nbsp;K2000                       |               0.73 |           2 |           64 |              312 (75%) |               444 (53%) |               171 (21%) |
 | 🟢&nbsp;GeForce&nbsp;GT&nbsp;630&nbsp;(OEM)     |               0.46 |           2 |           29 |              151 (81%) |               185 (50%) |                78 (21%) |
 | 🟢&nbsp;Quadro&nbsp;NVS&nbsp;290                |               0.03 |       0.256 |            6 |                1 ( 2%) |                 1 ( 1%) |                 1 ( 1%) |

From 1e4bf872fb1907915e3f6bc8ad706cce2c20a226 Mon Sep 17 00:00:00 2001
From: Moritz Lehmann <moritz.lehmann@uni-bayreuth.de>
Date: Tue, 23 May 2023 08:50:58 +0200
Subject: [PATCH 13/15] Removed Debug and x86 configurations from Visual Studio
 solution file

---
 FluidX3D.sln | 13 ++-----------
 README.md    |  2 +-
 2 files changed, 3 insertions(+), 12 deletions(-)

diff --git a/FluidX3D.sln b/FluidX3D.sln
index ff5ecc53..f150aeb1 100644
--- a/FluidX3D.sln
+++ b/FluidX3D.sln
@@ -1,26 +1,17 @@
 ﻿
 Microsoft Visual Studio Solution File, Format Version 12.00
-# Visual Studio 15
-VisualStudioVersion = 15.0.28307.1321
+# Visual Studio Version 16
+VisualStudioVersion = 16.0.31729.503
 MinimumVisualStudioVersion = 10.0.40219.1
 Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "FluidX3D", "FluidX3D.vcxproj", "{0ED9A517-40B5-43E1-95F5-A78F6D72B1E5}"
 EndProject
 Global
 	GlobalSection(SolutionConfigurationPlatforms) = preSolution
-		Debug|x64 = Debug|x64
-		Debug|x86 = Debug|x86
 		Release|x64 = Release|x64
-		Release|x86 = Release|x86
 	EndGlobalSection
 	GlobalSection(ProjectConfigurationPlatforms) = postSolution
-		{0ED9A517-40B5-43E1-95F5-A78F6D72B1E5}.Debug|x64.ActiveCfg = Debug|x64
-		{0ED9A517-40B5-43E1-95F5-A78F6D72B1E5}.Debug|x64.Build.0 = Debug|x64
-		{0ED9A517-40B5-43E1-95F5-A78F6D72B1E5}.Debug|x86.ActiveCfg = Debug|Win32
-		{0ED9A517-40B5-43E1-95F5-A78F6D72B1E5}.Debug|x86.Build.0 = Debug|Win32
 		{0ED9A517-40B5-43E1-95F5-A78F6D72B1E5}.Release|x64.ActiveCfg = Release|x64
 		{0ED9A517-40B5-43E1-95F5-A78F6D72B1E5}.Release|x64.Build.0 = Release|x64
-		{0ED9A517-40B5-43E1-95F5-A78F6D72B1E5}.Release|x86.ActiveCfg = Release|Win32
-		{0ED9A517-40B5-43E1-95F5-A78F6D72B1E5}.Release|x86.Build.0 = Release|Win32
 	EndGlobalSection
 	GlobalSection(SolutionProperties) = preSolution
 		HideSolutionNode = FALSE
diff --git a/README.md b/README.md
index 16220f88..9a9ec4db 100644
--- a/README.md
+++ b/README.md
@@ -283,7 +283,7 @@ $$f_j(i\\%2\\ ?\\ \vec{x}+\vec{e}_i\\ :\\ \vec{x},\\ t+\Delta t)=f_i^\textrm{tem
    - Set the initial condition in a loop that iterates over the entire lattice by writing to `lbm.rho[n]`/`lbm.u.x[n]`/`lbm.u.y[n]`/`lbm.u.z[n]`/`lbm.flags[n]`.
    - Call `lbm.run();` to initialize and execute the setup (infinite time steps) or `lbm.run(time_steps);` to execute only a specific number of time steps.
    - As long as the `lbm` object is in scope, you can access the memory. As soon as it goes out of scope, all memory associated to the current simulation is freed again.
-3. On Windows in Visual Studio Community select `Release` and `x64` and click compile+run, or on Linux run `chmod +x make.sh` and `./make.sh`; this will automatically select the fastest installed GPU(s). Alternatively, you can add the device ID(s) as command-line arguments, for example `./make.sh 2` to compile+run on device 2, or `bin/FluidX3D 1 3` to run the executable on devices 1 and 3. Compile time for the entire code is about 10 seconds. If you use `INTERACTIVE_GRAPHICS` on Linux, change to the "compile on Linux with X11" command in `make.sh`.
+3. On Windows in Visual Studio Community, click compile+run, or on Linux run `chmod +x make.sh` and `./make.sh`; this will automatically select the fastest installed GPU(s). Alternatively, you can add the device ID(s) as command-line arguments, for example `./make.sh 2` to compile+run on device 2, or `bin/FluidX3D 1 3` to run the executable on devices 1 and 3. Compile time for the entire code is about 10 seconds. If you use `INTERACTIVE_GRAPHICS` on Linux, change to the "compile on Linux with X11" command in `make.sh`.
 4. Keyboard/mouse controls with `INTERACTIVE_GRAPHICS`/`INTERACTIVE_GRAPHICS_ASCII` enabled:
    - <kbd>P</kbd>: start/pause the simulation
    - <kbd>H</kbd>: show/hide help

From 8e796b65dfc400208632d32624972e114e7de969 Mon Sep 17 00:00:00 2001
From: Moritz Lehmann <moritz.lehmann@uni-bayreuth.de>
Date: Tue, 23 May 2023 09:29:04 +0200
Subject: [PATCH 14/15] Removed Debug and x86 configurations from Visual Studio
 project file

---
 FluidX3D.vcxproj | 99 ------------------------------------------------
 1 file changed, 99 deletions(-)

diff --git a/FluidX3D.vcxproj b/FluidX3D.vcxproj
index 2a86af01..5c5fd5ee 100644
--- a/FluidX3D.vcxproj
+++ b/FluidX3D.vcxproj
@@ -1,18 +1,6 @@
 <?xml version="1.0" encoding="utf-8"?>
 <Project DefaultTargets="Build" ToolsVersion="15.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
   <ItemGroup Label="ProjectConfigurations">
-    <ProjectConfiguration Include="Debug|Win32">
-      <Configuration>Debug</Configuration>
-      <Platform>Win32</Platform>
-    </ProjectConfiguration>
-    <ProjectConfiguration Include="Release|Win32">
-      <Configuration>Release</Configuration>
-      <Platform>Win32</Platform>
-    </ProjectConfiguration>
-    <ProjectConfiguration Include="Debug|x64">
-      <Configuration>Debug</Configuration>
-      <Platform>x64</Platform>
-    </ProjectConfiguration>
     <ProjectConfiguration Include="Release|x64">
       <Configuration>Release</Configuration>
       <Platform>x64</Platform>
@@ -25,25 +13,6 @@
     <WindowsTargetPlatformVersion>10.0</WindowsTargetPlatformVersion>
   </PropertyGroup>
   <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
-  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" Label="Configuration">
-    <ConfigurationType>Application</ConfigurationType>
-    <UseDebugLibraries>true</UseDebugLibraries>
-    <PlatformToolset>v142</PlatformToolset>
-    <CharacterSet>MultiByte</CharacterSet>
-  </PropertyGroup>
-  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" Label="Configuration">
-    <ConfigurationType>Application</ConfigurationType>
-    <UseDebugLibraries>false</UseDebugLibraries>
-    <PlatformToolset>v142</PlatformToolset>
-    <WholeProgramOptimization>true</WholeProgramOptimization>
-    <CharacterSet>MultiByte</CharacterSet>
-  </PropertyGroup>
-  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="Configuration">
-    <ConfigurationType>Application</ConfigurationType>
-    <UseDebugLibraries>false</UseDebugLibraries>
-    <PlatformToolset>v142</PlatformToolset>
-    <CharacterSet>MultiByte</CharacterSet>
-  </PropertyGroup>
   <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="Configuration">
     <ConfigurationType>Application</ConfigurationType>
     <UseDebugLibraries>false</UseDebugLibraries>
@@ -56,15 +25,6 @@
   </ImportGroup>
   <ImportGroup Label="Shared">
   </ImportGroup>
-  <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
-    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
-  </ImportGroup>
-  <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
-    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
-  </ImportGroup>
-  <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
-    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
-  </ImportGroup>
   <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
     <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
   </ImportGroup>
@@ -73,65 +33,6 @@
     <OutDir>$(SolutionDir)bin\</OutDir>
     <IntDir>$(SolutionDir)temp\</IntDir>
   </PropertyGroup>
-  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
-    <OutDir>$(SolutionDir)bin\</OutDir>
-    <IntDir>$(SolutionDir)temp\</IntDir>
-  </PropertyGroup>
-  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
-    <OutDir>$(SolutionDir)bin\</OutDir>
-    <IntDir>$(SolutionDir)temp\</IntDir>
-  </PropertyGroup>
-  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
-    <OutDir>$(SolutionDir)bin\</OutDir>
-    <IntDir>$(SolutionDir)temp\</IntDir>
-  </PropertyGroup>
-  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
-    <ClCompile>
-      <WarningLevel>Level3</WarningLevel>
-      <Optimization>Disabled</Optimization>
-      <SDLCheck>true</SDLCheck>
-      <ConformanceMode>true</ConformanceMode>
-      <LanguageStandard>stdcpp17</LanguageStandard>
-      <AdditionalIncludeDirectories>$(SolutionDir)src\OpenCL\include;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
-    </ClCompile>
-    <Link>
-      <SubSystem>Console</SubSystem>
-    </Link>
-  </ItemDefinitionGroup>
-  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
-    <ClCompile>
-      <WarningLevel>Level3</WarningLevel>
-      <Optimization>Disabled</Optimization>
-      <SDLCheck>true</SDLCheck>
-      <ConformanceMode>true</ConformanceMode>
-      <AdditionalIncludeDirectories>$(SolutionDir)src\OpenCL\include;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
-      <DisableSpecificWarnings>26451;6386;%(DisableSpecificWarnings)</DisableSpecificWarnings>
-      <LanguageStandard>stdcpp17</LanguageStandard>
-      <MultiProcessorCompilation>true</MultiProcessorCompilation>
-    </ClCompile>
-    <Link>
-      <SubSystem>Console</SubSystem>
-      <AdditionalDependencies>OpenCL.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
-      <AdditionalLibraryDirectories>$(SolutionDir)src\OpenCL\lib</AdditionalLibraryDirectories>
-    </Link>
-  </ItemDefinitionGroup>
-  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
-    <ClCompile>
-      <WarningLevel>Level3</WarningLevel>
-      <Optimization>MaxSpeed</Optimization>
-      <FunctionLevelLinking>true</FunctionLevelLinking>
-      <IntrinsicFunctions>true</IntrinsicFunctions>
-      <SDLCheck>true</SDLCheck>
-      <ConformanceMode>true</ConformanceMode>
-      <LanguageStandard>stdcpp17</LanguageStandard>
-      <AdditionalIncludeDirectories>$(SolutionDir)src\OpenCL\include;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
-    </ClCompile>
-    <Link>
-      <SubSystem>Console</SubSystem>
-      <EnableCOMDATFolding>true</EnableCOMDATFolding>
-      <OptimizeReferences>true</OptimizeReferences>
-    </Link>
-  </ItemDefinitionGroup>
   <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
     <ClCompile>
       <WarningLevel>Level3</WarningLevel>

From d6485e7e960855984a05796df5efe3af52a5f951 Mon Sep 17 00:00:00 2001
From: Moritz Lehmann <moritz.lehmann@uni-bayreuth.de>
Date: Mon, 29 May 2023 09:25:12 +0200
Subject: [PATCH 15/15] FluidX3D v2.7 update

---
 LICENSE.md      |   2 +-
 README.md       |  25 +++++++++---
 src/defines.hpp |   2 +-
 src/info.cpp    |   4 +-
 src/kernel.cpp  | 105 ++++++++++++++++++++++++++++++++----------------
 src/lbm.cpp     |  96 +++++++++++++++++++++++++++++++++++--------
 src/lbm.hpp     |  20 +++++++--
 src/main.cpp    |  32 +++++++++++----
 8 files changed, 214 insertions(+), 72 deletions(-)

diff --git a/LICENSE.md b/LICENSE.md
index df017491..416f1598 100644
--- a/LICENSE.md
+++ b/LICENSE.md
@@ -11,4 +11,4 @@ Permission is hereby granted, free of charge, to any person obtaining a copy of
 
 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 
-German [Act on Copyright and Related Rights](https://www.gesetze-im-internet.de/englisch_urhg/englisch_urhg.html) (Urheberrechtsgesetz - UrhG) - Copyright Act of 9 September 1965 (Federal Law Gazette I, p. 1273), as last amended by Article 25 of the Act of 23 June 2021 (Federal Law Gazette I, p. 1858) - applies, in particular also [§ 97 (2) UrhG](https://www.gesetze-im-internet.de/englisch_urhg/englisch_urhg.html#p0881).
\ No newline at end of file
+German [Act on Copyright and Related Rights](https://www.gesetze-im-internet.de/englisch_urhg/englisch_urhg.html) (Urheberrechtsgesetz - UrhG) - Copyright Act of 9 September 1965 (Federal Law Gazette I, p. 1273), as last amended by Article 25 of the Act of 23 June 2021 (Federal Law Gazette I, p. 1858) - applies, in particular also [§ 97 (2) UrhG](https://www.gesetze-im-internet.de/englisch_urhg/englisch_urhg.html#p0881). The name "FluidX3D" is protected by German Werktitelschutz, [§ 5 (3) MarkenG](https://www.gesetze-im-internet.de/markeng/__5.html).
\ No newline at end of file
diff --git a/README.md b/README.md
index 9a9ec4db..33469ef8 100644
--- a/README.md
+++ b/README.md
@@ -4,6 +4,7 @@ The fastest and most memory efficient lattice Boltzmann CFD software, running on
 
 <a href="https://youtu.be/-MkRBeQkLk8"><img src="https://img.youtube.com/vi/o3TPN142HxM/maxresdefault.jpg" width="50%"></img></a><a href="https://youtu.be/oC6U1M0Fsug"><img src="https://img.youtube.com/vi/oC6U1M0Fsug/maxresdefault.jpg" width="50%"></img></a><br>
 <a href="https://youtu.be/XOfXHgP4jnQ"><img src="https://img.youtube.com/vi/XOfXHgP4jnQ/maxresdefault.jpg" width="50%"></img></a><a href="https://youtu.be/3JNVBQyetMA"><img src="https://img.youtube.com/vi/3JNVBQyetMA/maxresdefault.jpg" width="50%"></img></a>
+(click on images to show videos on YouTube)
 
 
 <details><summary>Update History</summary>
@@ -57,6 +58,17 @@ The fastest and most memory efficient lattice Boltzmann CFD software, running on
   - fixed bug where moving objects during re-voxelization would leave an erroneous trail of solid grid cells behind
 - v2.6 (16.04.2023)
   - patched OpenCL issues of Intel Arc GPUs: now VRAM allocations >4GB are possible and correct VRAM capacity is reported
+- v2.7 (29.05.2023)
+  - added slice visualization (key <kbd>2</kbd> / key <kbd>3</kbd> modes, then switch through slice modes with key <kbd>T</kbd>, move slice with keys <kbd>Q</kbd>/<kbd>E</kbd>)
+  - made flag wireframe / solid surface visualization kernels toggleable with key <kbd>1</kbd>
+  - added surface pressure visualization (key <kbd>1</kbd> when `FORCE_FIELD` is enabled and `lbm.calculate_force_on_boundaries();` is called)
+  - added binary `.vtk` export function for meshes with `lbm.write_mesh_to_vtk(Mesh* mesh);`
+  - added `time_step_multiplicator` for `integrate_particles()` function in PARTICLES extension
+  - made correction of wrong memory reporting on Intel Arc more robust
+  - fixed bug in `write_file()` template functions
+  - reverted to a separate `cl::Context` for each OpenCL device, as a shared Context would otherwise allocate extra VRAM on all other unused Nvidia GPUs
+  - removed Debug and x86 configurations from Visual Studio solution file (one less complication for compiling)
+  - fixed bug where particles could get too close to walls and get stuck, or leave the fluid phase (added boundary force)
 
 </details>
 
@@ -247,9 +259,9 @@ $$f_j(i\\%2\\ ?\\ \vec{x}+\vec{e}_i\\ :\\ \vec{x},\\ t+\Delta t)=f_i^\textrm{tem
 - rendering is fully multi-GPU-parallelized via seamless domain decomposition rasterization
 - with interactive graphics mode disabled, image resolution can be as large as VRAM allows for (4K/8K/16K and above)
 - (interacitive) visualization modes:
-  - flags (and force vectors on solid boundary cells if the extension is used)
-  - velocity field
-  - streamlines
+  - flag wireframe / solid surface (and force vectors on solid cells or surface pressure if the extension is used)
+  - velocity field (with slice mode)
+  - streamlines (with slice mode)
   - velocity-colored Q-criterion isosurface
   - rasterized free surface with [marching-cubes](http://paulbourke.net/geometry/polygonise/)
   - [raytraced free surface](https://www.researchgate.net/publication/360501260_Combined_scientific_CFD_simulation_and_interactive_raytracing_with_OpenCL) with fast ray-grid traversal and marching-cubes, either 1-4 rays/pixel or 1-10 rays/pixel
@@ -283,17 +295,19 @@ $$f_j(i\\%2\\ ?\\ \vec{x}+\vec{e}_i\\ :\\ \vec{x},\\ t+\Delta t)=f_i^\textrm{tem
    - Set the initial condition in a loop that iterates over the entire lattice by writing to `lbm.rho[n]`/`lbm.u.x[n]`/`lbm.u.y[n]`/`lbm.u.z[n]`/`lbm.flags[n]`.
    - Call `lbm.run();` to initialize and execute the setup (infinite time steps) or `lbm.run(time_steps);` to execute only a specific number of time steps.
    - As long as the `lbm` object is in scope, you can access the memory. As soon as it goes out of scope, all memory associated to the current simulation is freed again.
-3. On Windows in Visual Studio Community and click compile+run, or on Linux run `chmod +x make.sh` and `./make.sh`; this will automatically select the fastest installed GPU(s). Alternatively, you can add the device ID(s) as command-line arguments, for example `./make.sh 2` to compile+run on device 2, or `bin/FluidX3D 1 3` to run the executable on devices 1 and 3. Compile time for the entire code is about 10 seconds. If you use `INTERACTIVE_GRAPHICS` on Linux, change to the "compile on Linux with X11" command in `make.sh`.
+3. On Windows in Visual Studio Community click compile+run, or on Linux run `chmod +x make.sh` and `./make.sh`; this will automatically select the fastest installed GPU(s). Alternatively, you can add the device ID(s) as command-line arguments, for example `./make.sh 2` to compile+run on device 2, or `bin/FluidX3D 1 3` to run the executable on devices 1 and 3. Compile time for the entire code is about 10 seconds. If you use `INTERACTIVE_GRAPHICS` on Linux, change to the "compile on Linux with X11" command in `make.sh`.
 4. Keyboard/mouse controls with `INTERACTIVE_GRAPHICS`/`INTERACTIVE_GRAPHICS_ASCII` enabled:
    - <kbd>P</kbd>: start/pause the simulation
    - <kbd>H</kbd>: show/hide help
-   - <kbd>1</kbd>: flags (and force vectors on solid boundary cells if the extension is used)
+   - <kbd>1</kbd>: flag wireframe / solid surface (and force vectors on solid cells or surface pressure if the extension is used)
    - <kbd>2</kbd>: velocity field
    - <kbd>3</kbd>: streamlines
    - <kbd>4</kbd>: vorticity / velocity-colored Q-criterion isosurface
    - <kbd>5</kbd>: rasterized free surface
    - <kbd>6</kbd>: raytraced free surface
    - <kbd>7</kbd>: particles
+   - <kbd>T</kbd>: toggle slice visualization mode
+   - <kbd>Q</kbd>/<kbd>E</kbd>: move slice in slice visualization mode
    - <kbd>Mouse</kbd> or <kbd>I</kbd>/<kbd>J</kbd>/<kbd>K</kbd>/<kbd>L</kbd>: rotate camera
    - <kbd>Scrollwheel</kbd> or <kbd>+</kbd>/<kbd>-</kbd>: zoom (centered camera mode) or camera movement speed (free camera mode)
    - <kbd>Mouseclick</kbd> or <kbd>U</kbd>: toggle rotation with <kbd>Mouse</kbd> and angle snap rotation with <kbd>I</kbd>/<kbd>J</kbd>/<kbd>K</kbd>/<kbd>L</kbd>
@@ -398,6 +412,7 @@ Colors: 🔴 AMD, 🔵 Intel, 🟢 Nvidia, 🟣 Apple, 🟡 Samsung
 | 🟢&nbsp;GeForce&nbsp;GT&nbsp;630&nbsp;(OEM)     |               0.46 |           2 |           29 |              151 (81%) |               185 (50%) |                78 (21%) |
 | 🟢&nbsp;Quadro&nbsp;NVS&nbsp;290                |               0.03 |       0.256 |            6 |                1 ( 2%) |                 1 ( 1%) |                 1 ( 1%) |
 |                                                 |                    |             |              |                        |                         |                         |
+| 🟣&nbsp;M2&nbsp;Max&nbsp;GPU&nbsp;38C&nbsp;32GB |               9.73 |          22 |          400 |             2405 (92%) |              4641 (89%) |              2444 (47%) |
 | 🟣&nbsp;M1&nbsp;Max&nbsp;GPU&nbsp;24C&nbsp;32GB |               6.14 |          22 |          400 |             2369 (91%) |              4496 (87%) |              2777 (53%) |
 | 🟣&nbsp;M1&nbsp;Pro&nbsp;GPU&nbsp;16C&nbsp;16GB |               4.10 |          11 |          200 |             1204 (92%) |              2329 (90%) |              1855 (71%) |
 | 🔴&nbsp;Radeon&nbsp;Vega&nbsp;8&nbsp;(4750G)    |               2.15 |          27 |           57 |              263 (71%) |               511 (70%) |               501 (68%) |
diff --git a/src/defines.hpp b/src/defines.hpp
index ded587f7..8a4464d7 100644
--- a/src/defines.hpp
+++ b/src/defines.hpp
@@ -33,7 +33,7 @@
 #define GRAPHICS_BACKGROUND_COLOR 0x000000 // set background color; black background (default) = 0x000000, white background = 0xFFFFFF
 #define GRAPHICS_U_MAX 0.2f // maximum velocity for velocity coloring in units of LBM lattice speed of sound (c=1/sqrt(3)) (default: 0.2f)
 #define GRAPHICS_Q_CRITERION 0.0001f // Q-criterion value for Q-criterion isosurface visualization (default: 0.0001f)
-#define GRAPHICS_BOUNDARY_FORCE_SCALE 100.0f // scaling factor for visualization of forces on solid boundaries if VOLUME_FORCE is enabled and lbm.calculate_force_on_boundaries(); is called (default: 100.0f)
+#define GRAPHICS_F_MAX 0.002f // maximum force in LBM units for visualization of forces on solid boundaries if VOLUME_FORCE is enabled and lbm.calculate_force_on_boundaries(); is called (default: 0.002f)
 #define GRAPHICS_STREAMLINE_SPARSE 4 // set how many streamlines there are every x lattice points
 #define GRAPHICS_STREAMLINE_LENGTH 128 // set maximum length of streamlines
 #define GRAPHICS_RAYTRACING_TRANSMITTANCE 0.25f // transmitted light fraction in raytracing graphics ("0.25f" = 1/4 of light is transmitted and 3/4 is absorbed along longest box side length, "1.0f" = no absorption)
diff --git a/src/info.cpp b/src/info.cpp
index 1d408ad0..3876f0b2 100644
--- a/src/info.cpp
+++ b/src/info.cpp
@@ -67,8 +67,8 @@ void Info::print_logo() const {
 	print("|                                  ");                print("\\  \\ /  /", c);                 print("                                  |\n");
 	print("|                                   ");                print("\\  '  /", c);                  print("                                   |\n");
 	print("|                                    ");                print("\\   /", c);                  print("                                    |\n");
-	print("|                                     ");                print("\\ /", c);                  print("                FluidX3D Version 2.6 |\n");
-	print("|                                      ");                 print("'", c);                  print("         Copyright (c) Moritz Lehmann |\n");
+	print("|                                     ");                print("\\ /", c);                  print("                FluidX3D Version 2.7 |\n");
+	print("|                                      ");                 print("'", c);                  print("     Copyright (c) Dr. Moritz Lehmann |\n");
 }
 void Info::print_initialize() {
 	const float Re = lbm->get_Re_max();
diff --git a/src/kernel.cpp b/src/kernel.cpp
index 03c65f68..450f6c00 100644
--- a/src/kernel.cpp
+++ b/src/kernel.cpp
@@ -165,7 +165,7 @@ string opencl_c_container() { return R( // ########################## begin of O
 	else if(h<360.0f) { r = c; b = x; }
 	return (int)((r+m)*255.0f)<<16|(int)((g+m)*255.0f)<<8|(int)((b+m)*255.0f);
 }
-)+R(int lighting(const int c, const float3 p, const float3 normal, const float* camera_cache) { // calculate lighting of triangle
+)+R(int shading(const int c, const float3 p, const float3 normal, const float* camera_cache) { // calculate shading of triangle
 	const float dis  = camera_cache[ 1]; // fetch camera parameters (rotation matrix, camera position, etc.)
 	const float posx = camera_cache[ 2]-def_domain_offset_x;
 	const float posy = camera_cache[ 3]-def_domain_offset_y;
@@ -1922,7 +1922,26 @@ string opencl_c_container() { return R( // ########################## begin of O
 )+"#endif"+R( // FORCE_FIELD
 
 )+"#ifdef PARTICLES"+R(
-)+R(kernel void integrate_particles)+"("+R(global float* particles, const global float* u, const float time_step_multiplicator // ) {
+)+R(float3 particle_boundary_force(const float3 p, const global uchar* flags) { // normalized pseudo-force to prevent particles from entering solid boundaries or exiting fluid phase; p: particle position, flags: per-cell flag bytes; returns a normalized direction pointing away from nearby TYPE_S/TYPE_G cells, or the zero vector if none is close enough
+	const float xa=p.x-0.5f+1.5f*def_Nx, ya=p.y-0.5f+1.5f*def_Ny, za=p.z-0.5f+1.5f*def_Nz; // shift position into lattice coordinates; the extra +1.0f*def_N* presumably keeps the value positive for the uint cast below and is removed again by the modulo in the corner loop
+	const uint xb=(uint)xa, yb=(uint)ya, zb=(uint)za; // integer casting to find the lowest-index corner (i=j=k=0) of the surrounding 2x2x2 block of lattice points
+	const float x1=xa-(float)xb, y1=ya-(float)yb, z1=za-(float)zb; // fractional part of the position = particle location inside the unit cube spanned by the eight corners
+	float3 boundary_force = (float3)(0.0f, 0.0f, 0.0f); // accumulated push direction away from all flagged corners
+	float boundary_distance = 2.0f; // distance to the nearest flagged corner found so far; 2.0f exceeds the unit cube diagonal sqrt(3), so it acts as "no flagged corner found"
+	for(uint c=0u; c<8u; c++) { // loop over the eight corner points
+		const uint i=(c&0x04u)>>2, j=(c&0x02u)>>1, k=c&0x01u; // disassemble the three lowest bits of c into corner offsets i, j, k (each 0 or 1)
+		const uint x=(xb+i)%def_Nx, y=(yb+j)%def_Ny, z=(zb+k)%def_Nz; // calculate corner lattice positions, wrapped around the box by the modulo
+		const uint n = x+(y+z*def_Ny)*def_Nx; // calculate lattice linear index (x runs fastest, then y, then z)
+		if(flags[n]&(TYPE_S|TYPE_G)) { // only corners flagged TYPE_S or TYPE_G contribute
+			boundary_force += (float3)(0.5f, 0.5f, 0.5f)-(float3)((float)i, (float)j, (float)k); // add the vector from this corner to the cube center, i.e. pointing away from the flagged corner
+			boundary_distance = fmin(boundary_distance, length((float3)(x1, y1, z1)-(float3)((float)i, (float)j, (float)k))); // keep the smallest particle-to-flagged-corner distance
+		}
+	}
+	const float particle_radius = 0.5f; // has to be between 0.0f and 0.5f, default: 0.5f (hydrodynamic radius)
+	return boundary_distance-0.5f<particle_radius ? normalize(boundary_force) : (float3)(0.0f, 0.0f, 0.0f); // force acts only if the nearest flagged corner is closer than 0.5f+particle_radius; NOTE(review): opposing flagged corners can cancel to a zero vector before normalize() - confirm the OpenCL result is zero in that case
+} // particle_boundary_force()
+
+)+R(kernel void integrate_particles)+"("+R(global float* particles, const global float* u, const global uchar* flags, const float time_step_multiplicator // ) {
 )+"#ifdef FORCE_FIELD"+R(
 	, volatile global float* F, const float fx, const float fy, const float fz
 )+"#endif"+R( // FORCE_FIELD
@@ -1937,7 +1956,9 @@ string opencl_c_container() { return R( // ########################## begin of O
 		spread_force(F, p0, Fn); // do force spreading
 	}
 )+"#endif"+R( // FORCE_FIELD
-	const float3 un = interpolate_u(mirror_position(p0), u)*time_step_multiplicator; // trilinear interpolation of velocity at point p
+	const float3 p0_mirrored = mirror_position(p0);
+	float3 un = interpolate_u(p0_mirrored, u); // trilinear interpolation of velocity at point p
+	un = (un+length(un)*particle_boundary_force(p0_mirrored, flags))*time_step_multiplicator;
 	const float3 p = mirror_position(p0+un); // advect particles
 	particles[                           n] = p.x;
 	particles[    def_particles_N+(ulong)n] = p.y;
@@ -2323,16 +2344,16 @@ string opencl_c_container() { return R( // ########################## begin of O
 		const float Fnl = length(Fn);
 		if(Fnl>0.0f) {
 			const int c = iron_color(255.0f*Fnl); // color boundaries depending on the force on them
-			draw_line(p, p+5.0f*Fn, c, camera_cache, bitmap, zbuffer); // draw colored force vectors
+			draw_line(p, p+Fn, c, camera_cache, bitmap, zbuffer); // draw colored force vectors
 		}
 	}
 )+"#endif"+R( // FORCE_FIELD
-}/**/
+}
 
-/*)+"#ifndef FORCE_FIELD"+R( // render solid boundaries with marching-cubes
-)+R(kernel void graphics_flags(const global uchar* flags, const global float* camera, global int* bitmap, global int* zbuffer) {
+)+"#ifndef FORCE_FIELD"+R( // render solid boundaries with marching-cubes
+)+R(kernel void graphics_flags_mc(const global uchar* flags, const global float* camera, global int* bitmap, global int* zbuffer) {
 )+"#else"+R( // FORCE_FIELD
-)+R(kernel void graphics_flags(const global uchar* flags, const global float* camera, global int* bitmap, global int* zbuffer, const global float* F) {
+)+R(kernel void graphics_flags_mc(const global uchar* flags, const global float* camera, global int* bitmap, global int* zbuffer, const global float* F) {
 )+"#endif"+R( // FORCE_FIELD
 	const uint n = get_global_id(0);
 	if(n>=(uint)def_N||is_halo(n)) return; // don't execute graphics_flags() on halo
@@ -2362,31 +2383,43 @@ string opencl_c_container() { return R( // ########################## begin of O
 	float camera_cache[15]; // cache camera parameters in case the kernel draws more than one shape
 	for(uint i=0u; i<15u; i++) camera_cache[i] = camera[i];
 	const float3 offset = (float3)((float)xyz.x+0.5f-0.5f*(float)def_Nx, (float)xyz.y+0.5f-0.5f*(float)def_Ny, (float)xyz.z+0.5f-0.5f*(float)def_Nz);
+)+"#ifdef FORCE_FIELD"+R(
+	float3 Fj[8];
+	for(uint i=0u; i<8u; i++) Fj[i] = v[i]==1.0f ? (float3)(F[j[i]], F[def_N+(ulong)j[i]], F[2ul*def_N+(ulong)j[i]]) : (float3)(0.0f, 0.0f, 0.0f);
+)+"#endif"+R( // FORCE_FIELD
 	for(uint i=0u; i<tn; i++) {
-		const float3 p0 = triangles[3u*i   ]+offset;
-		const float3 p1 = triangles[3u*i+1u]+offset;
-		const float3 p2 = triangles[3u*i+2u]+offset;
-		const float3 p=(p0+p1+p2)/3.0f, normal=cross(p1-p0, p2-p0);
-		const int c = lighting(191<<16|191<<8|191, p, normal, camera_cache);
-		draw_triangle(p0, p1, p2, c, camera_cache, bitmap, zbuffer);
-	}
+		const float3 p0 = triangles[3u*i   ];
+		const float3 p1 = triangles[3u*i+1u];
+		const float3 p2 = triangles[3u*i+2u];
+		const float3 normal = normalize(cross(p1-p0, p2-p0));
 )+"#ifdef FORCE_FIELD"+R(
-	const uchar flagsn_bo = flags[n]&TYPE_BO;
-	const float3 p = position(xyz);
-	if(flagsn_bo==TYPE_S) {
-		const float3 Fn = def_scale_F*(float3)(F[n], F[def_N+(ulong)n], F[2ul*def_N+(ulong)n]);
-		const float Fnl = length(Fn);
-		if(Fnl>0.0f) {
-			const int c = iron_color(255.0f*Fnl); // color boundaries depending on the force on them
-			draw_line(p, p+5.0f*Fn, c, camera_cache, bitmap, zbuffer); // draw colored force vectors
+		int c0, c1, c2; {
+			const float x1=p0.x, y1=p0.y, z1=p0.z, x0=1.0f-x1, y0=1.0f-y1, z0=1.0f-z1; // calculate interpolation factors
+			const float3 Fi = (x0*y0*z0)*Fj[0]+(x1*y0*z0)*Fj[1]+(x1*y0*z1)*Fj[2]+(x0*y0*z1)*Fj[3]+(x0*y1*z0)*Fj[4]+(x1*y1*z0)*Fj[5]+(x1*y1*z1)*Fj[6]+(x0*y1*z1)*Fj[7]; // perform trilinear interpolation
+			c0 = shading(rainbow_color(191.0f+255.0f*def_scale_F*dot(Fi, normal)), p0+offset, normal, camera_cache); // rainbow_color(255.0f*def_scale_u*length(Fi));
+		} {
+			const float x1=p1.x, y1=p1.y, z1=p1.z, x0=1.0f-x1, y0=1.0f-y1, z0=1.0f-z1; // calculate interpolation factors
+			const float3 Fi = (x0*y0*z0)*Fj[0]+(x1*y0*z0)*Fj[1]+(x1*y0*z1)*Fj[2]+(x0*y0*z1)*Fj[3]+(x0*y1*z0)*Fj[4]+(x1*y1*z0)*Fj[5]+(x1*y1*z1)*Fj[6]+(x0*y1*z1)*Fj[7]; // perform trilinear interpolation
+			c1 = shading(rainbow_color(191.0f+255.0f*def_scale_F*dot(Fi, normal)), p1+offset, normal, camera_cache); // rainbow_color(255.0f*def_scale_u*length(Fi));
+		} {
+			const float x1=p2.x, y1=p2.y, z1=p2.z, x0=1.0f-x1, y0=1.0f-y1, z0=1.0f-z1; // calculate interpolation factors
+			const float3 Fi = (x0*y0*z0)*Fj[0]+(x1*y0*z0)*Fj[1]+(x1*y0*z1)*Fj[2]+(x0*y0*z1)*Fj[3]+(x0*y1*z0)*Fj[4]+(x1*y1*z0)*Fj[5]+(x1*y1*z1)*Fj[6]+(x0*y1*z1)*Fj[7]; // perform trilinear interpolation
+			c2 = shading(rainbow_color(191.0f+255.0f*def_scale_F*dot(Fi, normal)), p2+offset, normal, camera_cache); // rainbow_color(255.0f*def_scale_u*length(Fi));
 		}
-	}
+		draw_triangle_interpolated(p0+offset, p1+offset, p2+offset, c0, c1, c2, camera_cache, bitmap, zbuffer); // draw triangle with interpolated colors
+)+"#else"+R( // FORCE_FIELD
+		const int c = shading(191<<16|191<<8|191, (p0+p1+p2)/3.0f+offset, normal, camera_cache);
+		draw_triangle(p0+offset, p1+offset, p2+offset, c, camera_cache, bitmap, zbuffer);
 )+"#endif"+R( // FORCE_FIELD
-}/**/
+	}
+}
 
-)+R(kernel void graphics_field(const global uchar* flags, const global float* u, const global float* camera, global int* bitmap, global int* zbuffer) {
+)+R(kernel void graphics_field(const global uchar* flags, const global float* u, const global float* camera, global int* bitmap, global int* zbuffer, const int slice_mode, const int slice_x, const int slice_y, const int slice_z) {
 	const uint n = get_global_id(0);
 	if(n>=(uint)def_N||is_halo(n)) return; // don't execute graphics_field() on halo
+	const uint3 xyz = coordinates(n);
+	const bool rx=(int)xyz.x!=slice_x, ry=(int)xyz.y!=slice_y, rz=(int)xyz.z!=slice_z;
+	if((slice_mode==1&&rx)||(slice_mode==2&&ry)||(slice_mode==3&&rz)||(slice_mode==4&&rx&&rz)||(slice_mode==5&&rx&&ry&&rz)||(slice_mode==6&&ry&&rz)||(slice_mode==7&&rx&&ry)) return;
 )+"#ifndef MOVING_BOUNDARIES"+R(
 	if(flags[n]&(TYPE_S|TYPE_E|TYPE_I|TYPE_G)) return;
 )+"#else"+R( // EQUILIBRIUM_BOUNDARIES
@@ -2403,9 +2436,9 @@ string opencl_c_container() { return R( // ########################## begin of O
 }
 
 )+"#ifndef GRAPHICS_TEMPERATURE"+R(
-)+R(kernel void graphics_streamline(const global uchar* flags, const global float* u, const global float* camera, global int* bitmap, global int* zbuffer) {
+)+R(kernel void graphics_streamline(const global uchar* flags, const global float* u, const global float* camera, global int* bitmap, global int* zbuffer, const int slice_mode, const int slice_x, const int slice_y, const int slice_z) {
 )+"#else"+R( // GRAPHICS_TEMPERATURE
-)+R(kernel void graphics_streamline(const global uchar* flags, const global float* u, const global float* camera, global int* bitmap, global int* zbuffer, const global float* T) {
+)+R(kernel void graphics_streamline(const global uchar* flags, const global float* u, const global float* camera, global int* bitmap, global int* zbuffer, const int slice_mode, const int slice_x, const int slice_y, const int slice_z, const global float* T) {
 )+"#endif"+R( // GRAPHICS_TEMPERATURE
 	const uint n = get_global_id(0);
 )+"#ifndef D2Q9"+R(
@@ -2415,12 +2448,15 @@ string opencl_c_container() { return R( // ########################## begin of O
 	const uint y = t/(def_Nx/def_streamline_sparse);
 	const uint x = t%(def_Nx/def_streamline_sparse);
 	float3 p = (float)def_streamline_sparse*((float3)((float)x+0.5f, (float)y+0.5f, (float)z+0.5f))-0.5f*((float3)((float)def_Nx, (float)def_Ny, (float)def_Nz));
+	const bool rx=abs((int)(x*def_streamline_sparse+def_streamline_sparse/2u)-slice_x)>(int)def_streamline_sparse/2, ry=abs((int)(y*def_streamline_sparse+def_streamline_sparse/2u)-slice_y)>(int)def_streamline_sparse/2, rz=abs((int)(z*def_streamline_sparse+def_streamline_sparse/2u)-slice_z)>(int)def_streamline_sparse/2;
 )+"#else"+R( // D2Q9
 	if(n>=(def_Nx/def_streamline_sparse)*(def_Ny/def_streamline_sparse)) return;
 	const uint y = n/(def_Nx/def_streamline_sparse); // disassemble 1D index to 3D coordinates
 	const uint x = n%(def_Nx/def_streamline_sparse);
 	float3 p = ((float3)((float)def_streamline_sparse*((float)x+0.5f), (float)def_streamline_sparse*((float)y+0.5f), 0.5f))-0.5f*((float3)((float)def_Nx, (float)def_Ny, (float)def_Nz));
+	const bool rx=abs((int)(x*def_streamline_sparse+def_streamline_sparse/2u)-slice_x)>(int)def_streamline_sparse/2, ry=abs((int)(y*def_streamline_sparse+def_streamline_sparse/2u)-slice_y)>(int)def_streamline_sparse/2, rz=true;
 )+"#endif"+R( // D2Q9
+	if((slice_mode==1&&rx)||(slice_mode==2&&ry)||(slice_mode==3&&rz)||(slice_mode==4&&rx&&rz)||(slice_mode==5&&rx&&ry&&rz)||(slice_mode==6&&ry&&rz)||(slice_mode==7&&rx&&ry)) return;
 	float camera_cache[15]; // cache camera parameters in case the kernel draws more than one shape
 	for(uint i=0u; i<15u; i++) camera_cache[i] = camera[i];
 	const float hLx=0.5f*(float)(def_Nx-2u*(def_Dx>1u)), hLy=0.5f*(float)(def_Ny-2u*(def_Dy>1u)), hLz=0.5f*(float)(def_Nz-2u*(def_Dz>1u));
@@ -2545,20 +2581,19 @@ string opencl_c_container() { return R( // ########################## begin of O
 		const float3 p0 = triangles[3u*i   ]; // triangle coordinates in [0,1] (local cell)
 		const float3 p1 = triangles[3u*i+1u];
 		const float3 p2 = triangles[3u*i+2u];
-		const float3 normal = cross(p1-p0, p2-p0);
-		int c0, c1, c2;
-		{
+		const float3 normal = normalize(cross(p1-p0, p2-p0));
+		int c0, c1, c2; {
 			const float x1=p0.x, y1=p0.y, z1=p0.z, x0=1.0f-x1, y0=1.0f-y1, z0=1.0f-z1; // calculate interpolation factors
 			const float3 ui = (x0*y0*z0)*uj[0]+(x1*y0*z0)*uj[1]+(x1*y0*z1)*uj[2]+(x0*y0*z1)*uj[3]+(x0*y1*z0)*uj[4]+(x1*y1*z0)*uj[5]+(x1*y1*z1)*uj[6]+(x0*y1*z1)*uj[7]; // perform trilinear interpolation
-			c0 = lighting(rainbow_color(255.0f*def_scale_u*length(ui)), p0+offset, normal, camera_cache); // rainbow_color(255.0f*def_scale_u*length(ui));
+			c0 = shading(rainbow_color(255.0f*def_scale_u*length(ui)), p0+offset, normal, camera_cache); // rainbow_color(255.0f*def_scale_u*length(ui));
 		} {
 			const float x1=p1.x, y1=p1.y, z1=p1.z, x0=1.0f-x1, y0=1.0f-y1, z0=1.0f-z1; // calculate interpolation factors
 			const float3 ui = (x0*y0*z0)*uj[0]+(x1*y0*z0)*uj[1]+(x1*y0*z1)*uj[2]+(x0*y0*z1)*uj[3]+(x0*y1*z0)*uj[4]+(x1*y1*z0)*uj[5]+(x1*y1*z1)*uj[6]+(x0*y1*z1)*uj[7]; // perform trilinear interpolation
-			c1 = lighting(rainbow_color(255.0f*def_scale_u*length(ui)), p1+offset, normal, camera_cache); // rainbow_color(255.0f*def_scale_u*length(ui));
+			c1 = shading(rainbow_color(255.0f*def_scale_u*length(ui)), p1+offset, normal, camera_cache); // rainbow_color(255.0f*def_scale_u*length(ui));
 		} {
 			const float x1=p2.x, y1=p2.y, z1=p2.z, x0=1.0f-x1, y0=1.0f-y1, z0=1.0f-z1; // calculate interpolation factors
 			const float3 ui = (x0*y0*z0)*uj[0]+(x1*y0*z0)*uj[1]+(x1*y0*z1)*uj[2]+(x0*y0*z1)*uj[3]+(x0*y1*z0)*uj[4]+(x1*y1*z0)*uj[5]+(x1*y1*z1)*uj[6]+(x0*y1*z1)*uj[7]; // perform trilinear interpolation
-			c2 = lighting(rainbow_color(255.0f*def_scale_u*length(ui)), p2+offset, normal, camera_cache); // rainbow_color(255.0f*def_scale_u*length(ui));
+			c2 = shading(rainbow_color(255.0f*def_scale_u*length(ui)), p2+offset, normal, camera_cache); // rainbow_color(255.0f*def_scale_u*length(ui));
 		}
 		draw_triangle_interpolated(p0+offset, p1+offset, p2+offset, c0, c1, c2, camera_cache, bitmap, zbuffer); // draw triangle with interpolated colors
 	}
@@ -2597,7 +2632,7 @@ string opencl_c_container() { return R( // ########################## begin of O
 		const float3 p1 = triangles[3u*i+1u]+offset;
 		const float3 p2 = triangles[3u*i+2u]+offset;
 		const float3 p=(p0+p1+p2)/3.0f, normal=cross(p1-p0, p2-p0);
-		const int c = lighting(55<<16|155<<8|255, p, normal, camera_cache);
+		const int c = shading(55<<16|155<<8|255, p, normal, camera_cache);
 		draw_triangle(p0, p1, p2, c, camera_cache, bitmap, zbuffer);
 		//draw_line(p0, p1, c, camera_cache, bitmap, zbuffer); // wireframe rendering
 		//draw_line(p0, p2, c, camera_cache, bitmap, zbuffer);
diff --git a/src/lbm.cpp b/src/lbm.cpp
index ee18ed0a..c82e7d26 100644
--- a/src/lbm.cpp
+++ b/src/lbm.cpp
@@ -102,7 +102,7 @@ void LBM_Domain::allocate(Device& device) {
 
 #ifdef PARTICLES
 	particles = Memory<float>(device, (ulong)particles_N, 3u);
-	kernel_integrate_particles = Kernel(device, (ulong)particles_N, "integrate_particles", particles, u, 1.0f);
+	kernel_integrate_particles = Kernel(device, (ulong)particles_N, "integrate_particles", particles, u, flags, 1.0f);
 #ifdef FORCE_FIELD
 	kernel_integrate_particles.add_parameters(F, fx, fy, fz);
 #endif // FORCE_FIELD
@@ -153,9 +153,9 @@ void LBM_Domain::enqueue_update_moving_boundaries() { // mark/unmark nodes next
 void LBM_Domain::enqueue_integrate_particles(const uint time_step_multiplicator) { // intgegrate particles forward in time and couple particles to fluid
 #ifdef FORCE_FIELD
 	if(particles_rho!=1.0f) kernel_reset_force_field.enqueue_run(); // only reset force field if particles have buoyancy and apply forces on fluid
-	kernel_integrate_particles.set_parameters(4u, fx, fy, fz);
+	kernel_integrate_particles.set_parameters(5u, fx, fy, fz);
 #endif // FORCE_FIELD
-	kernel_integrate_particles.set_parameters(2u, (float)time_step_multiplicator).enqueue_run();
+	kernel_integrate_particles.set_parameters(3u, (float)time_step_multiplicator).enqueue_run();
 }
 #endif // PARTICLES
 
@@ -373,16 +373,18 @@ void LBM_Domain::Graphics::allocate(Device& device) {
 	kernel_clear = Kernel(device, bitmap.length(), "graphics_clear", bitmap, zbuffer);
 
 	kernel_graphics_flags = Kernel(device, lbm->get_N(), "graphics_flags", lbm->flags, camera_parameters, bitmap, zbuffer);
-	kernel_graphics_field = Kernel(device, lbm->get_N(), "graphics_field", lbm->flags, lbm->u, camera_parameters, bitmap, zbuffer);
+	kernel_graphics_flags_mc = Kernel(device, lbm->get_N(), "graphics_flags_mc", lbm->flags, camera_parameters, bitmap, zbuffer);
+	kernel_graphics_field = Kernel(device, lbm->get_N(), "graphics_field", lbm->flags, lbm->u, camera_parameters, bitmap, zbuffer, 0, 0, 0, 0);
 #ifndef D2Q9
-	kernel_graphics_streamline = Kernel(device, (lbm->get_Nx()/GRAPHICS_STREAMLINE_SPARSE)*(lbm->get_Ny()/GRAPHICS_STREAMLINE_SPARSE)*(lbm->get_Nz()/GRAPHICS_STREAMLINE_SPARSE), "graphics_streamline", lbm->flags, lbm->u, camera_parameters, bitmap, zbuffer); // 3D
+	kernel_graphics_streamline = Kernel(device, (lbm->get_Nx()/GRAPHICS_STREAMLINE_SPARSE)*(lbm->get_Ny()/GRAPHICS_STREAMLINE_SPARSE)*(lbm->get_Nz()/GRAPHICS_STREAMLINE_SPARSE), "graphics_streamline", lbm->flags, lbm->u, camera_parameters, bitmap, zbuffer, 0, 0, 0, 0); // 3D
 #else // D2Q9
-	kernel_graphics_streamline = Kernel(device, (lbm->get_Nx()/GRAPHICS_STREAMLINE_SPARSE)*(lbm->get_Ny()/GRAPHICS_STREAMLINE_SPARSE), "graphics_streamline", lbm->flags, lbm->u, camera_parameters, bitmap, zbuffer); // 2D
+	kernel_graphics_streamline = Kernel(device, (lbm->get_Nx()/GRAPHICS_STREAMLINE_SPARSE)*(lbm->get_Ny()/GRAPHICS_STREAMLINE_SPARSE), "graphics_streamline", lbm->flags, lbm->u, camera_parameters, bitmap, zbuffer, 0, 0, 0, 0); // 2D
 #endif // D2Q9
 	kernel_graphics_q = Kernel(device, lbm->get_N(), "graphics_q", lbm->flags, lbm->u, camera_parameters, bitmap, zbuffer);
 
 #ifdef FORCE_FIELD
 	kernel_graphics_flags.add_parameters(lbm->F);
+	kernel_graphics_flags_mc.add_parameters(lbm->F);
 #endif // FORCE_FIELD
 
 #ifdef SURFACE
@@ -410,7 +412,7 @@ bool LBM_Domain::Graphics::update_camera() {
 	}
 	return change; // return false if camera parameters remain unchanged
 }
-void LBM_Domain::Graphics::enqueue_draw_frame() {
+void LBM_Domain::Graphics::enqueue_draw_frame(const int visualization_modes, const int slice_mode, const int slice_x, const int slice_y, const int slice_z) {
 	const bool camera_update = update_camera();
 #if defined(INTERACTIVE_GRAPHICS)||defined(INTERACTIVE_GRAPHICS_ASCII)
 	if(!camera_update&&!camera.key_update&&lbm->get_t()==t_last_frame) return; // don't render a new frame if the scene hasn't changed since last frame
@@ -420,15 +422,16 @@ void LBM_Domain::Graphics::enqueue_draw_frame() {
 	if(camera_update) camera_parameters.enqueue_write_to_device(); // camera_parameters PCIe transfer and kernel_clear execution can happen simulataneously
 	kernel_clear.enqueue_run();
 #ifdef SURFACE
-	if(key_6&&lbm->get_D()==1u) kernel_graphics_raytrace_phi.enqueue_run(); // disable raytracing for multi-GPU (domain decomposition rendering doesn't work for raytracing)
-	if(key_5) kernel_graphics_rasterize_phi.enqueue_run();
+	if((visualization_modes&0b01000000)&&lbm->get_D()==1u) kernel_graphics_raytrace_phi.enqueue_run(); // disable raytracing for multi-GPU (domain decomposition rendering doesn't work for raytracing)
+	if(visualization_modes&0b00100000) kernel_graphics_rasterize_phi.enqueue_run();
 #endif // SURFACE
-	if(key_1) kernel_graphics_flags.enqueue_run();
-	if(key_2) kernel_graphics_field.enqueue_run();
-	if(key_3) kernel_graphics_streamline.enqueue_run();
-	if(key_4) kernel_graphics_q.enqueue_run();
+	if((visualization_modes&0b11)==1||(visualization_modes&0b11)==2) kernel_graphics_flags.enqueue_run();
+	if((visualization_modes&0b11)==2||(visualization_modes&0b11)==3) kernel_graphics_flags_mc.enqueue_run();
+	if(visualization_modes&0b00000100) kernel_graphics_field.set_parameters(5u, slice_mode, slice_x-lbm->Ox, slice_y-lbm->Oy, slice_z-lbm->Oz).enqueue_run();
+	if(visualization_modes&0b00001000) kernel_graphics_streamline.set_parameters(5u, slice_mode, slice_x-lbm->Ox, slice_y-lbm->Oy, slice_z-lbm->Oz).enqueue_run();
+	if(visualization_modes&0b00010000) kernel_graphics_q.enqueue_run();
 #ifdef PARTICLES
-	if(key_7) kernel_graphics_particles.enqueue_run();
+	if(visualization_modes&0b10000000) kernel_graphics_particles.enqueue_run();
 #endif // PARTICLES
 	bitmap.enqueue_read_from_device();
 	if(lbm->get_D()>1u) zbuffer.enqueue_read_from_device();
@@ -447,7 +450,7 @@ string LBM_Domain::Graphics::device_defines() const { return
 	"\n	#define def_screen_height "    +to_string(camera.height)+"u"
 	"\n	#define def_scale_u "          +to_string(1.0f/(0.57735027f*(GRAPHICS_U_MAX)))+"f"
 	"\n	#define def_scale_Q_min "      +to_string(GRAPHICS_Q_CRITERION)+"f"
-	"\n	#define def_scale_F "          +to_string(GRAPHICS_BOUNDARY_FORCE_SCALE)+"f"
+	"\n	#define def_scale_F "          +to_string(1.0f/(GRAPHICS_F_MAX))+"f"
 	"\n	#define def_streamline_sparse "+to_string(GRAPHICS_STREAMLINE_SPARSE)+"u"
 	"\n	#define def_streamline_length "+to_string(GRAPHICS_STREAMLINE_LENGTH)+"u"
 	"\n	#define def_n "                +to_string(1.333f)+"f" // refractive index of water for raytracing graphics
@@ -891,6 +894,40 @@ void LBM::unvoxelize_mesh_on_device(const Mesh* mesh, const uchar flag) { // rem
 	for(uint d=0u; d<get_D(); d++) lbm[d]->enqueue_unvoxelize_mesh_on_device(mesh, flag);
 	for(uint d=0u; d<get_D(); d++) lbm[d]->finish_queue();
 }
+void LBM::write_mesh_to_vtk(const Mesh* mesh, const string& path) { // write mesh to binary .vtk file (POLYDATA dataset: a POINTS section followed by a POLYGONS section)
+	const string header_1 = "# vtk DataFile Version 3.0\nData\nBINARY\nDATASET POLYDATA\nPOINTS "+to_string(3u*mesh->triangle_number)+" float\n"; // text header of the POINTS section: 3 points per triangle, stored as float
+	const string header_2 = "POLYGONS "+to_string(mesh->triangle_number)+" "+to_string(4u*mesh->triangle_number)+"\n"; // text header of the POLYGONS section: number of triangles, then total number of ints that follow
+	float* points = new float[9u*mesh->triangle_number]; // 3 vertices * 3 coordinates per triangle, every triangle gets its own 3 points (no vertex sharing)
+	int* triangles = new int[4u*mesh->triangle_number]; // per triangle: vertex count followed by 3 point indices
+	for(uint i=0u; i<mesh->triangle_number; i++) { // fill both buffers triangle by triangle
+		points[9u*i   ] = reverse_bytes(mesh->p0[i].x-center().x); // vertex 0; coordinates are written relative to center(); reverse_bytes() presumably converts to the big-endian byte order of binary .vtk files - TODO confirm
+		points[9u*i+1u] = reverse_bytes(mesh->p0[i].y-center().y);
+		points[9u*i+2u] = reverse_bytes(mesh->p0[i].z-center().z);
+		points[9u*i+3u] = reverse_bytes(mesh->p1[i].x-center().x); // vertex 1
+		points[9u*i+4u] = reverse_bytes(mesh->p1[i].y-center().y);
+		points[9u*i+5u] = reverse_bytes(mesh->p1[i].z-center().z);
+		points[9u*i+6u] = reverse_bytes(mesh->p2[i].x-center().x); // vertex 2
+		points[9u*i+7u] = reverse_bytes(mesh->p2[i].y-center().y);
+		points[9u*i+8u] = reverse_bytes(mesh->p2[i].z-center().z);
+		triangles[4u*i   ] = reverse_bytes(3); // 3 vertices per triangle
+		triangles[4u*i+1u] = reverse_bytes(3*(int)i  ); // index of vertex 0 in the points array
+		triangles[4u*i+2u] = reverse_bytes(3*(int)i+1); // index of vertex 1 in the points array
+		triangles[4u*i+3u] = reverse_bytes(3*(int)i+2); // index of vertex 2 in the points array
+	}
+	const string filename = default_filename(path, "mesh", ".vtk", get_t()); // presumably falls back to a default name built from "mesh" and the current time step - TODO confirm against default_filename()
+	create_folder(filename); // presumably creates the directory part of filename if it is missing - TODO confirm
+	std::ofstream file(filename, std::ios::out|std::ios::binary); // NOTE(review): stream state is never checked, so a failed open is still reported as "saved" below
+	file.write(header_1.c_str(), header_1.length()); // write non-binary file header (POINTS)
+	file.write((char*)points, 4u*9u*mesh->triangle_number); // write binary data: 9 floats per triangle, 4 bytes per float assumed
+	file.write(header_2.c_str(), header_2.length()); // write non-binary file header (POLYGONS)
+	file.write((char*)triangles, 4u*4u*mesh->triangle_number); // write binary data: 4 ints per triangle, 4 bytes per int assumed
+	file.close();
+	delete[] points;
+	delete[] triangles;
+	info.allow_rendering = false; // temporarily disable interactive rendering while the message is printed
+	print_info("File \""+filename+"\" saved.");
+	info.allow_rendering = true;
+}
 void LBM::voxelize_stl(const string& path, const float3& center, const float3x3& rotation, const float size, const uchar flag) { // voxelize triangle mesh
 	const Mesh* mesh = read_stl(path, this->size(), center, rotation, size);
 	flags.write_to_device();
@@ -911,12 +948,37 @@ void LBM::voxelize_stl(const string& path, const float size, const uchar flag) {
 #ifdef GRAPHICS
 int* LBM::Graphics::draw_frame() {
 #ifndef UPDATE_FIELDS
-	if(key_2||key_3||key_4) {
+	if(visualization_modes&0b00011100) {
 		for(uint d=0u; d<lbm->get_D(); d++) lbm->lbm[d]->enqueue_update_fields(); // only call update_fields() if the time step has changed since the last rendered frame
 		//for(uint d=0u; d<lbm->get_D(); d++) lbm->communicate_rho_u_flags();
 	}
 #endif // UPDATE_FIELDS
-	for(uint d=0u; d<lbm->get_D(); d++) lbm->lbm[d]->graphics.enqueue_draw_frame();
+
+	if(key_1) { visualization_modes = (visualization_modes&~0b11)|(((visualization_modes&0b11)+1)%4); key_1 = false; }
+	if(key_2) { visualization_modes ^= 0b00000100; key_2 = false; }
+	if(key_3) { visualization_modes ^= 0b00001000; key_3 = false; }
+	if(key_4) { visualization_modes ^= 0b00010000; key_4 = false; }
+	if(key_5) { visualization_modes ^= 0b00100000; key_5 = false; }
+	if(key_6) { visualization_modes ^= 0b01000000; key_6 = false; }
+	if(key_7) { visualization_modes ^= 0b10000000; key_7 = false; }
+
+	if(key_T) {
+		slice_mode = (slice_mode+1)%8; key_T = false;
+	}
+	if(slice_mode==1u) {
+		if(key_Q) { slice_x = clamp(slice_x-1, 0, (int)lbm->get_Nx()-1); key_Q = false; }
+		if(key_E) { slice_x = clamp(slice_x+1, 0, (int)lbm->get_Nx()-1); key_E = false; }
+	}
+	if(slice_mode==2u) {
+		if(key_Q) { slice_y = clamp(slice_y-1, 0, (int)lbm->get_Ny()-1); key_Q = false; }
+		if(key_E) { slice_y = clamp(slice_y+1, 0, (int)lbm->get_Ny()-1); key_E = false; }
+	}
+	if(slice_mode==3u) {
+		if(key_Q) { slice_z = clamp(slice_z-1, 0, (int)lbm->get_Nz()-1); key_Q = false; }
+		if(key_E) { slice_z = clamp(slice_z+1, 0, (int)lbm->get_Nz()-1); key_E = false; }
+	}
+
+	for(uint d=0u; d<lbm->get_D(); d++) lbm->lbm[d]->graphics.enqueue_draw_frame(visualization_modes, slice_mode, slice_x, slice_y, slice_z);
 	for(uint d=0u; d<lbm->get_D(); d++) lbm->lbm[d]->finish_queue();
 	int* bitmap = lbm->lbm[0]->graphics.get_bitmap();
 	int* zbuffer = lbm->lbm[0]->graphics.get_zbuffer();
diff --git a/src/lbm.hpp b/src/lbm.hpp
index 7d2f6590..62e0b6c2 100644
--- a/src/lbm.hpp
+++ b/src/lbm.hpp
@@ -142,7 +142,8 @@ class LBM_Domain {
 		Memory<float> camera_parameters; // contains camera position, rotation, field of view etc.
 
 		LBM_Domain* lbm = nullptr;
-		Kernel kernel_graphics_flags; // render flag lattice
+		Kernel kernel_graphics_flags; // render flag lattice with wireframe
+		Kernel kernel_graphics_flags_mc; // render flag lattice with marching-cubes
 		Kernel kernel_graphics_field; // render a colored velocity vector for each node
 		Kernel kernel_graphics_streamline; // render streamlines
 		Kernel kernel_graphics_q; // render vorticity (Q-criterion)
@@ -179,7 +180,7 @@ class LBM_Domain {
 			return *this;
 		}
 		void allocate(Device& device); // allocate memory for bitmap and zbuffer
-		void enqueue_draw_frame(); // main rendering function, calls rendering kernels
+		void enqueue_draw_frame(const int visualization_modes, const int slice_mode=0, const int slice_x=0, const int slice_y=0, const int slice_z=0); // main rendering function, calls rendering kernels
 		int* get_bitmap(); // returns pointer to bitmap
 		int* get_zbuffer(); // returns pointer to zbuffer
 		string device_defines() const; // returns preprocessor constants for embedding in OpenCL C code
@@ -265,15 +266,15 @@ class LBM {
 					data[i*(ulong)dimensions()+(ulong)d] = reverse_bytes(reference(i, d)); // SoA <- AoS
 				}
 			}
-			create_folder(path);
 			const string filename = create_file_extension(path, ".vtk");
+			create_folder(filename);
 			std::ofstream file(filename, std::ios::out|std::ios::binary);
 			file.write(header.c_str(), header.length()); // write non-binary file header
 			file.write((char*)data, capacity()); // write binary data
 			file.close();
 			delete[] data;
 			info.allow_rendering = false; // temporarily disable interactive rendering
-			print_info("File \""+path+"\" saved.");
+			print_info("File \""+filename+"\" saved.");
 			info.allow_rendering = true;
 		}
 
@@ -473,6 +474,7 @@ class LBM {
 
 	void voxelize_mesh_on_device(const Mesh* mesh, const uchar flag=TYPE_S, const float3& rotation_center=float3(0.0f), const float3& linear_velocity=float3(0.0f), const float3& rotational_velocity=float3(0.0f)); // voxelize mesh
 	void unvoxelize_mesh_on_device(const Mesh* mesh, const uchar flag=TYPE_S); // remove voxelized triangle mesh from LBM grid
+	void write_mesh_to_vtk(const Mesh* mesh, const string& path=""); // write mesh to binary .vtk file
 	void voxelize_stl(const string& path, const float3& center, const float3x3& rotation, const float size=0.0f, const uchar flag=TYPE_S); // read and voxelize binary .stl file
 	void voxelize_stl(const string& path, const float3x3& rotation, const float size=0.0f, const uchar flag=TYPE_S); // read and voxelize binary .stl file (place in box center)
 	void voxelize_stl(const string& path, const float3& center, const float size=0.0f, const uchar flag=TYPE_S); // read and voxelize binary .stl file (no rotation)
@@ -491,10 +493,15 @@ class LBM {
 		}
 
 	public:
+		int visualization_modes=0, slice_mode=0, slice_x=0, slice_y=0, slice_z=0; // visualization_modes = bitmask: bits 0-1 flags (0 off, 1 wireframe, 2 both, 3 solid), bits 2-7 toggle field, streamlines, Q-criterion, rasterized surface, raytraced surface, particles; slice visualization: mode = { 0 (no slice), 1 (x), 2 (y), 3 (z), 4 (xz), 5 (xyz), 6 (yz), 7 (xy) }, slice_{xyz} = position of slices
+
 		Graphics() {} // default constructor
 		Graphics(LBM* lbm) {
 			this->lbm = lbm;
 			camera.set_zoom(0.5f*(float)fmax(fmax(lbm->get_Nx(), lbm->get_Ny()), lbm->get_Nz()));
+			slice_x = (int)lbm->get_Nx()/2; // initial slice positions: half the lattice extent along each axis
+			slice_y = (int)lbm->get_Ny()/2;
+			slice_z = (int)lbm->get_Nz()/2;
 			default_settings();
 		}
 		~Graphics() { // destructor must wait for all encoder threads to finish
@@ -510,6 +517,11 @@ class LBM {
 		}
 		Graphics& operator=(const Graphics& graphics) { // copy assignment
 			lbm = graphics.lbm;
+			visualization_modes = graphics.visualization_modes; // copy the visualization mode bitmask
+			slice_mode = graphics.slice_mode; // copy the slice mode and the three slice positions
+			slice_x = graphics.slice_x;
+			slice_y = graphics.slice_y;
+			slice_z = graphics.slice_z;
 			return *this;
 		}
 
diff --git a/src/main.cpp b/src/main.cpp
index 364a0c07..b03f29bb 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -40,16 +40,34 @@ void main_label(const double frametime) {
 #endif // PARTICLES
 			const int ox=2, oy=2;
 			int i = 0;
+
+			const int mode = info.lbm->graphics.visualization_modes;
+			string mode_1 = (mode&3)==0 ? "inactive" : (mode&3)==1 ? " flags  " : (mode&3)==2 ? "  both  " : " solid  ";
+			string mode_2 = mode&0b00000100 ? " active " : "inactive";
+			string mode_3 = mode&0b00001000 ? " active " : "inactive";
+			string mode_4 = mode&0b00010000 ? " active " : "inactive";
+			string mode_5 = surface ? (mode&0b00100000 ? " active " : "inactive") : "disabled";
+			string mode_6 = surface&&info.lbm->get_D()==1u ? (mode&0b01000000 ? " active " : "inactive") : "disabled";
+			string mode_7 = particles ? (mode&0b10000000 ? " active " : "inactive") : "disabled";
+
+			const int sl = info.lbm->graphics.slice_mode;
+			const string sx = "x="+alignr(4u, info.lbm->graphics.slice_x);
+			const string sy = "y="+alignr(4u, info.lbm->graphics.slice_y);
+			const string sz = "z="+alignr(4u, info.lbm->graphics.slice_z);
+			string slice = sl==0 ? "      disabled      " : sl==1 ? sx+"|      |      " : sl==2 ? "      |"+sy+"|      " : sl==3 ? "      |      |"+sz : sl==4 ? sx+"|      |"+sz : sl==5 ? sx+"|"+sy+"|"+sz : sl==6 ? "      |"+sy+"|"+sz : sx+"|"+sy+"|      ";
+
 			draw_label(ox, oy+i, "Keyboard/Mouse Controls: ", c); i+=2*FONT_HEIGHT;
 			draw_label(ox, oy+i, "P ("+string(key_P?"running ":" paused ")+"): start/pause simulation", c); i+=FONT_HEIGHT;
 			draw_label(ox, oy+i, "H ("+string(key_H?" shown  ":" hidden ")+"): show/hide help", c); i+=2*FONT_HEIGHT;
-			draw_label(ox, oy+i, "1 ("+string(key_1?" active ":"inactive")+"): flags (and force vectors on solid boundary nodes if the extension is used)", c); i+=FONT_HEIGHT;
-			draw_label(ox, oy+i, "2 ("+string(key_2?" active ":"inactive")+"): velocity field", c); i+=FONT_HEIGHT;
-			draw_label(ox, oy+i, "3 ("+string(key_3?" active ":"inactive")+"): streamlines", c); i+=FONT_HEIGHT;
-			draw_label(ox, oy+i, "4 ("+string(key_4?" active ":"inactive")+"): vorticity / velocity-colored Q-criterion isosurface", c); i+=FONT_HEIGHT;
-			draw_label(ox, oy+i, "5 ("+string(surface ? (key_5?" active ":"inactive") : "disabled")+"): rasterized free surface", c); i+=FONT_HEIGHT;
-			draw_label(ox, oy+i, "6 ("+string(surface&&info.lbm->get_D()==1u ? (key_6?" active ":"inactive") : "disabled")+"): raytraced free surface", c); i+=FONT_HEIGHT;
-			draw_label(ox, oy+i, "7 ("+string(particles ? (key_7?" active ":"inactive") : "disabled")+"): particles", c); i+=2*FONT_HEIGHT;
+			draw_label(ox, oy+i, "1 ("+mode_1+"): flag wireframe / solid surface (and force vectors on solid cells or surface pressure if the extension is used)", c); i+=FONT_HEIGHT;
+			draw_label(ox, oy+i, "2 ("+mode_2+"): velocity field", c); i+=FONT_HEIGHT;
+			draw_label(ox, oy+i, "3 ("+mode_3+"): streamlines", c); i+=FONT_HEIGHT;
+			draw_label(ox, oy+i, "4 ("+mode_4+"): vorticity / velocity-colored Q-criterion isosurface", c); i+=FONT_HEIGHT;
+			draw_label(ox, oy+i, "5 ("+mode_5+"): rasterized free surface", c); i+=FONT_HEIGHT;
+			draw_label(ox, oy+i, "6 ("+mode_6+"): raytraced free surface", c); i+=FONT_HEIGHT;
+			draw_label(ox, oy+i, "7 ("+mode_7+"): particles", c); i+=2*FONT_HEIGHT;
+			draw_label(ox, oy+i, "T: ("+slice+"): toggle slice visualization mode", c); i+=FONT_HEIGHT;
+			draw_label(ox, oy+i, "Q/E: move slice in slice visualization mode", c); i+=2*FONT_HEIGHT;
 			draw_label(ox, oy+i, "Mouse or I/J/K/L (rx="+alignr(4u, to_int(fmod(degrees(camera.rx)+90.0+360.0, 360.0)-180.0))+", ry="+alignr(3u, to_int(180.0-degrees(camera.ry)))+"): rotate camera", c); i+=FONT_HEIGHT;
 			draw_label(ox, oy+i, "Scrollwheel or +/- ("+to_string(camera.free ? (float)camera.free_camera_velocity : camera.zoom*(float)fmax(fmax(info.lbm->get_Nx(), info.lbm->get_Ny()), info.lbm->get_Nz())/(float)min(camera.width, camera.height), 3u)+"): zoom (centered camera mode) or camera movement speed (free camera mode)", c); i+=FONT_HEIGHT;
 			draw_label(ox, oy+i, "Mouseclick or U: toggle rotation with Mouse and angle snap rotation with I/J/K/L", c); i+=FONT_HEIGHT;