diff --git a/README.md b/README.md
index 7969c827..06f1caba 100644
--- a/README.md
+++ b/README.md
@@ -235,7 +235,7 @@ $$f_j(i\\%2\\ ?\\ \vec{x}+\vec{e}_i\\ :\\ \vec{x},\\ t+\Delta t)=f_i^\textrm{tem
   - CPUs
   - even smartphone ARM GPUs
 - supports parallelization across multiple GPUs on a single node (PC/laptop/server) with PCIe communication, no SLI/Crossfire/NVLink/InfinityFabric or MPI installation required; the GPUs don't even have to be from the same vendor, but similar memory capacity and bandwidth is recommended
-- supports importing and voxelizing triangle meshes from binary `.stl` files
+- supports importing and voxelizing triangle meshes from binary `.stl` files, with fast GPU voxelization
 - supports exporting volumetric data as binary `.vtk` files
 - supports exporting rendered frames as `.png`/`.qoi`/`.bmp` files; time-consuming image encoding is handled in parallel on the CPU while the simulation on GPU can continue without delay
 
@@ -246,9 +246,9 @@ $$f_j(i\\%2\\ ?\\ \vec{x}+\vec{e}_i\\ :\\ \vec{x},\\ t+\Delta t)=f_i^\textrm{tem
 Here are [performance benchmarks](https://doi.org/10.3390/computation10060092) on various hardware in MLUPs/s, or how many million lattice points are updated per second. The settings used for the benchmark are D3Q19 SRT with no extensions enabled (only LBM with implicit mid-grid bounce-back boundaries) and the setup consists of an empty cubic box with sufficient size (typically 256³). Without extensions, a single lattice point requires:
 - a memory capacity of 93 (FP32/FP32) or 55 (FP32/FP16) Bytes
 - a memory bandwidth of 153 (FP32/FP32) or 77 (FP32/FP16) Bytes per time step
-- 360 (FP32/FP32) or 403 (FP32/FP16S) or 1272 (FP32/FP16C) FLOPs per time step (FP32+INT32 operations counted combined)
+- 363 (FP32/FP32) or 406 (FP32/FP16S) or 1275 (FP32/FP16C) FLOPs per time step (FP32+INT32 operations counted combined)
 
-In consequence, the arithmetic intensity of this implementation is 2.35 (FP32/FP32) or 5.23 (FP32/FP16S) or 16.52 (FP32/FP16C) FLOPs/Byte. So performance is only limited by memory bandwidth.
+In consequence, the arithmetic intensity of this implementation is 2.37 (FP32/FP32) or 5.27 (FP32/FP16S) or 16.56 (FP32/FP16C) FLOPs/Byte. So performance is only limited by memory bandwidth.
 
 If your GPU is not on the list yet, you can report your benchmarks [here](https://github.com/ProjectPhysX/FluidX3D/issues/8).
 
@@ -338,7 +338,7 @@ If your GPU is not on the list yet, you can report your benchmarks [here](https:
 
 ## Multi-GPU Benchmarks
 
-Multi-GPU benchmarks are done at the largest possible grid resolution with a cubic domain, and either 2x1x1, 2x2x1 or 2x2x2 of these cubic domains together. The multiplicator numbers in brackets are relative to single-GPU results.
+Multi-GPU benchmarks are done at the largest possible grid resolution with a cubic domain, and either 2x1x1, 2x2x1 or 2x2x2 of these cubic domains together. The percentages in brackets are single-GPU roofline model efficiency, and the multiplicator numbers in brackets are scaling factors relative to benchmarked single-GPU performance.
 
 | Device                        | FP32<br>[TFlops/s] | Mem<br>[GB] | BW<br>[GB/s] | FP32/FP32<br>[MLUPs/s] | FP32/FP16S<br>[MLUPs/s] | FP32/FP16C<br>[MLUPs/s] |
 | :---------------------------- | -----------------: | ----------: | -----------: | ---------------------: | ----------------------: | ----------------------: |
@@ -348,12 +348,22 @@ Multi-GPU benchmarks are done at the largest possible grid resolution with a cub
 | 2x AMD Instinct MI250 (4 GCD) |             181.04 |         256 |         6554 |           16925 (3.0x) |            29163 (3.2x) |            29627 (3.5x) |
 | 4x AMD Instinct MI250 (8 GCD) |             362.08 |         512 |        13107 |           27350 (4.9x) |            52258 (5.8x) |            53521 (6.3x) |
 |                               |                    |             |              |                        |                         |                         |
+| 1x AMD Radeon VII             |              13.83 |          16 |         1024 |             4898 (73%) |              7778 (58%) |              5256 (40%) |
+| 2x AMD Radeon VII             |              27.66 |          32 |         2048 |            8113 (1.7x) |            15591 (2.0x) |            10352 (2.0x) |
+| 4x AMD Radeon VII             |              55.32 |          64 |         4096 |           12911 (2.6x) |            24273 (3.1x) |            17080 (3.2x) |
+|                               |                    |             |              |                        |                         |                         |
 | 1x Nvidia A100 SXM4 40GB      |              19.49 |          40 |         1555 |             8522 (84%) |             16013 (79%) |             11251 (56%) |
 | 2x Nvidia A100 SXM4 40GB      |              38.98 |          80 |         3110 |           13629 (1.6x) |            24620 (1.5x) |            18850 (1.7x) |
 | 4x Nvidia A100 SXM4 40GB      |              77.96 |         160 |         6220 |           17978 (2.1x) |            30604 (1.9x) |            30627 (2.7x) |
 |                               |                    |             |              |                        |                         |                         |
+| 1x Nvidia Tesla K40m          |               4.29 |          12 |          288 |             1131 (60%) |              1868 (50%) |               912 (24%) |
+| 2x Nvidia Tesla K40m          |               8.58 |          24 |          577 |            1971 (1.7x) |             3300 (1.8x) |             1801 (2.0x) |
+|                               |                    |             |              |                        |                         |                         |
 | 1x Nvidia Quadro RTX 8000 Pa. |              14.93 |          48 |          624 |             2591 (64%) |              5408 (67%) |              5607 (69%) |
 | 2x Nvidia Quadro RTX 8000 Pa. |              29.86 |          96 |         1248 |            4767 (1.8x) |             9607 (1.8x) |            10214 (1.8x) |
+|                               |                    |             |              |                        |                         |                         |
+| 1x Nvidia GeForce RTX 2080 Ti |              13.45 |          11 |          616 |             3194 (79%) |              6700 (84%) |              6853 (86%) |
+| 2x Nvidia GeForce RTX 2080 Ti |              26.90 |          22 |         1232 |            5085 (1.6x) |            10770 (1.6x) |            10922 (1.6x) |
 
 
 
@@ -402,7 +412,7 @@ Multi-GPU benchmarks are done at the largest possible grid resolution with a cub
 
 - <details><summary>Why no multi-relaxation-time (MRT) collision operator?</summary><br>The idea of MRT is to linearly transform the DDFs into "moment space" by matrix multiplication and relax these moments individually, promising better stability and accuracy. In practice, in the vast majority of cases, it has zero or even negative effects on stability and accuracy, and simple SRT is much superior. Apart from the kinematic shear viscosity and conserved terms, the remaining moments are non-physical quantities and their tuning is a blackbox. Although MRT can be implemented in an efficient manner with only a single matrix-vector multiplication in registers, leading to identical performance compared to SRT by remaining bandwidth-bound, storing the matrices vastly elongates and over-complicates the code for no real benefit.<br><br></details>
 
-- <details><summary>Can FluidX3D run on multiple GPUs at the same time?</summary><br>Yes. The simulation grid is then split in domains, one for each GPU (domain decomposition method). The GPUs essentially pool their memory, enabling much larger simulation grids and higher performance. Rendering is parallelized across multiple GPUs as well; each GPU renders its own domain with a 3D offset, then rendered frames from all GPUs are overlayed with their z-buffers. Communication between domains is done over PCIe, so no SLI/Crossfire/NVLink/InfinityFabric is required. All GPUs must however be installed in the same node (PC/laptop/server). Even unholy combinations of Nvidia/AMD/Intel GPUs will work, although it is recommended to only use GPUs with similar memory capacity and bandwidth together. Using a fast gaming GPU and slow integrated GPU together would only decrease performance due to communication overhead.<br><br></details>
+- <details><summary>Can FluidX3D run on multiple GPUs at the same time?</summary><br>Yes. The simulation grid is then split in domains, one for each GPU (domain decomposition method). The GPUs essentially pool their memory, enabling much larger grid resolution and higher performance. Rendering is parallelized across multiple GPUs as well; each GPU renders its own domain with a 3D offset, then rendered frames from all GPUs are overlayed with their z-buffers. Communication between domains is done over PCIe, so no SLI/Crossfire/NVLink/InfinityFabric is required. All GPUs must however be installed in the same node (PC/laptop/server). Even unholy combinations of Nvidia/AMD/Intel GPUs will work, although it is recommended to only use GPUs with similar memory capacity and bandwidth together. Using a fast gaming GPU and slow integrated GPU together would only decrease performance due to communication overhead.<br><br></details>
 
 ### Hardware
 
diff --git a/src/kernel.cpp b/src/kernel.cpp
index 51933433..36c2355c 100644
--- a/src/kernel.cpp
+++ b/src/kernel.cpp
@@ -1885,69 +1885,6 @@ string opencl_c_container() { return R( // ########################## begin of O
 
 
 
-)+R(kernel void voxelize_mesh(global uchar* flags, const uchar flag, const global float* p0, const global float* p1, const global float* p2, const uint triangle_number, float x0, float y0, float z0, float x1, float y1, float z1) { // voxelize triangle mesh
-	const uint n = get_global_id(0); // n = x+(y+z*Ny)*Nx
-	if(n>=(uint)def_N) return;
-	const float3 p = position(coordinates(n))+(float3)(0.5f*(float)((def_Nx-2u*(def_Dx>1u))*def_Dx)-0.5f, 0.5f*(float)((def_Ny-2u*(def_Dy>1u))*def_Dy)-0.5f, 0.5f*(float)((def_Nz-2u*(def_Dz>1u))*def_Dz)-0.5f)+(float3)(def_domain_offset_x, def_domain_offset_y, def_domain_offset_z);
-	const float3 r0_origin = p;
-	const float3 r1_origin = p;
-	const float3 r0_direction = (float3)(+0.01f, +0.04f, +1.03f); // from each grid point, shoot an outward ray and count how often it intersects the mesh, odd number -> grid point is inside mesh
-	const float3 r1_direction = (float3)(-0.05f, -0.06f, -1.07f); // to eliminate errors, repeat with a second ray in a different random direction
-	uint intersections_0=0u, intersections_1=0u;
-	/*if(p.x<x0||p.y<y0||p.z<z0||p.x>x1||p.y>y1||p.z>z1) return; // return straight away if grid point is outside the bounds of the mesh (~4x faster)
-	for(uint i=0u; i<triangle_number; i++) {
-		const float3 p0i = (float3)(p0[3u*i], p0[3u*i+1u], p0[3u*i+2u]);
-		const float3 p1i = (float3)(p1[3u*i], p1[3u*i+1u], p1[3u*i+2u]);
-		const float3 p2i = (float3)(p2[3u*i], p2[3u*i+1u], p2[3u*i+2u]);
-		const float3 u=p1i-p0i, v=p2i-p0i;
-		{
-			const float3 w=r0_origin-p0i, h=cross(r0_direction, v), q=cross(w, u);
-			const float f=1.0f/dot(u, h), s=f*dot(w, h), t=f*dot(r0_direction, q);
-			intersections_0 += (uint)(s>=0.0f&&s<=1.0f&&t>=0.0f&&s+t<=1.0f&&f*dot(v, q)>0.0f);
-		} {
-			const float3 w=r1_origin-p0i, h=cross(r1_direction, v), q=cross(w, u);
-			const float f=1.0f/dot(u, h), s=f*dot(w, h), t=f*dot(r1_direction, q);
-			intersections_1 += (uint)(s>=0.0f&&s<=1.0f&&t>=0.0f&&s+t<=1.0f&&f*dot(v, q)>0.0f);
-		}
-	}/**/
-	const bool condition = p.x<x0||p.y<y0||p.z<z0||p.x>x1||p.y>y1||p.z>z1; // use local memory (~25% faster)
-	volatile local uint workgroup_condition;
-	workgroup_condition = 1u;
-	barrier(CLK_LOCAL_MEM_FENCE);
-	atomic_and(&workgroup_condition, (uint)condition);
-	barrier(CLK_LOCAL_MEM_FENCE);
-	const bool workgroup_all = (bool)workgroup_condition;
-	if(workgroup_all) return; // return straight away if grid point is outside the bounds of the mesh (~4x faster)
-	const uint lid = get_local_id(0);
-	local float3 cache_p0[def_workgroup_size];
-	local float3 cache_p1[def_workgroup_size];
-	local float3 cache_p2[def_workgroup_size];
-	for(uint i=0u; i<triangle_number; i+=def_workgroup_size) {
-		const uint tx=3u*(i+lid), ty=tx+1u, tz=ty+1u;
-		cache_p0[lid] = (float3)(p0[tx], p0[ty], p0[tz]);
-		cache_p1[lid] = (float3)(p1[tx], p1[ty], p1[tz]);
-		cache_p2[lid] = (float3)(p2[tx], p2[ty], p2[tz]);
-		barrier(CLK_LOCAL_MEM_FENCE);
-		for(int j=0; j<def_workgroup_size&&i+j<triangle_number; j++) {
-			const float3 p0i=cache_p0[j], p1i=cache_p1[j], p2i=cache_p2[j];
-			const float3 u=p1i-p0i, v=p2i-p0i;
-			{
-				const float3 w=r0_origin-p0i, h=cross(r0_direction, v), q=cross(w, u);
-				const float f=1.0f/dot(u, h), s=f*dot(w, h), t=f*dot(r0_direction, q);
-				intersections_0 += (uint)(s>=0.0f&&s<=1.0f&&t>=0.0f&&s+t<=1.0f&&f*dot(v, q)>0.0f);
-			} {
-				const float3 w=r1_origin-p0i, h=cross(r1_direction, v), q=cross(w, u);
-				const float f=1.0f/dot(u, h), s=f*dot(w, h), t=f*dot(r1_direction, q);
-				intersections_1 += (uint)(s>=0.0f&&s<=1.0f&&t>=0.0f&&s+t<=1.0f&&f*dot(v, q)>0.0f);
-			}
-		}
-		barrier(CLK_LOCAL_MEM_FENCE);
-	}/**/
-	if(intersections_0%2u&&intersections_1%2u) flags[n] = flag;
-} // voxelize_mesh()
-
-
-
 )+R(uint get_area(const uint direction) {
 	const uint A[3] = { def_Ax, def_Ay, def_Az };
 	return A[direction];
@@ -1969,8 +1906,8 @@ string opencl_c_container() { return R( // ########################## begin of O
 	return index(coordinates[direction]);
 }
 
-uint index_transfer(const uint side_i) {
-	const uint index_transfer_data[2u*def_dimensions*def_transfers] = {
+)+R(uint index_transfer(const uint side_i) {
+	const uchar index_transfer_data[2u*def_dimensions*def_transfers] = {
 )+"#if defined(D2Q9)"+R(
 		1,  5,  7, // xp
 		2,  6,  8, // xm
@@ -1999,7 +1936,7 @@ uint index_transfer(const uint side_i) {
 		6, 10, 12, 15, 17, 20, 21, 24, 26  // zm
 )+"#endif"+R( // D3Q27
 	};
-	return index_transfer_data[side_i];
+	return (uint)index_transfer_data[side_i];
 }
 )+R(void extract_fi(const uint a, const uint n, const uint side, const ulong t, global fpxx_copy* transfer_buffer, const global fpxx_copy* fi) {
 	uint j[def_velocity_set]; // neighbor indices
@@ -2128,6 +2065,101 @@ uint index_transfer(const uint side_i) {
 
 
 
+)+R(kernel void voxelize_mesh(const uint direction, global uchar* flags, const uchar flag, const global float* p0, const global float* p1, const global float* p2, const uint triangle_number, float x0, float y0, float z0, float x1, float y1, float z1) { // voxelize triangle mesh
+	const uint a=get_global_id(0), A=get_area(direction); // a = domain area index for each side, A = area of the domain boundary
+	if(a>=A) return; // area might not be a multiple of def_workgroup_size, so return here to avoid writing in unallocated memory space
+	const uint3 xyz = direction==0u ? (uint3)((uint)max(0, (int)x0-def_Ox), a%def_Ny, a/def_Ny) : direction==1u ? (uint3)(a/def_Nz, (uint)max(0, (int)y0-def_Oy), a%def_Nz) : (uint3)(a%def_Nx, a/def_Nx, (uint)max(0, (int)z0-def_Oz));
+	const float3 p = position(xyz);
+	const float3 offset = (float3)(0.5f*(float)((def_Nx-2u*(def_Dx>1u))*def_Dx)-0.5f, 0.5f*(float)((def_Ny-2u*(def_Dy>1u))*def_Dy)-0.5f, 0.5f*(float)((def_Nz-2u*(def_Dz>1u))*def_Dz)-0.5f)+(float3)(def_domain_offset_x, def_domain_offset_y, def_domain_offset_z);
+	const float3 r_origin = p+offset;
+	const float3 r_direction = (float3)((float)(direction==0u), (float)(direction==1u), (float)(direction==2u));
+	uint intersections=0u, intersections_check=0u;
+	ushort distances[64]; // allow up to 64 mesh intersections
+	const bool condition = direction==0u ? r_origin.y<y0||r_origin.z<z0||r_origin.y>y1||r_origin.z>z1 : direction==1u ? r_origin.x<x0||r_origin.z<z0||r_origin.x>x1||r_origin.z>z1 : r_origin.x<x0||r_origin.y<y0||r_origin.x>x1||r_origin.y>y1;
+
+	volatile local uint workgroup_condition; // use local memory optimization (~25% faster)
+	workgroup_condition = 1u;
+	barrier(CLK_LOCAL_MEM_FENCE);
+	atomic_and(&workgroup_condition, (uint)condition);
+	barrier(CLK_LOCAL_MEM_FENCE);
+	const bool workgroup_all = (bool)workgroup_condition;
+	if(workgroup_all) return; // return immediately if grid point is outside the bounding box of the mesh
+	const uint lid = get_local_id(0);
+	local float3 cache_p0[def_workgroup_size];
+	local float3 cache_p1[def_workgroup_size];
+	local float3 cache_p2[def_workgroup_size];
+	for(uint i=0u; i<triangle_number; i+=def_workgroup_size) {
+		const uint tx=3u*(i+lid), ty=tx+1u, tz=ty+1u;
+		cache_p0[lid] = (float3)(p0[tx], p0[ty], p0[tz]);
+		cache_p1[lid] = (float3)(p1[tx], p1[ty], p1[tz]);
+		cache_p2[lid] = (float3)(p2[tx], p2[ty], p2[tz]);
+		barrier(CLK_LOCAL_MEM_FENCE);
+		for(int j=0; j<def_workgroup_size&&i+j<triangle_number; j++) {
+			const float3 p0i=cache_p0[j], p1i=cache_p1[j], p2i=cache_p2[j];
+			const float3 u=p1i-p0i, v=p2i-p0i, w=r_origin-p0i, h=cross(r_direction, v), q=cross(w, u); // bidirectional ray-triangle intersection (Moeller-Trumbore algorithm)
+			const float f=1.0f/dot(u, h), s=f*dot(w, h), t=f*dot(r_direction, q), d=f*dot(v, q);
+			if(s>=0.0f&&s<1.0f&&t>=0.0f&&s+t<1.0f) { // ray-triangle intersection ahead or behind
+				if(d>0.0f) { // ray-triangle intersection ahead
+					if(intersections<64u&&d<65536.0f) distances[intersections] = (ushort)d; // store distance to intersection in array as ushort
+					intersections++;
+				} else { // ray-triangle intersection behind
+					intersections_check++; // cast a second ray to check if starting point is really inside (error correction)
+				}
+			}
+		}
+		barrier(CLK_LOCAL_MEM_FENCE);
+	}/**/
+
+	/*if(condition) return; // don't use local memory (this also runs on old OpenCL 1.0 GPUs)
+	for(uint i=0u; i<triangle_number; i++) {
+		const float3 p0i = (float3)(p0[3u*i], p0[3u*i+1u], p0[3u*i+2u]);
+		const float3 p1i = (float3)(p1[3u*i], p1[3u*i+1u], p1[3u*i+2u]);
+		const float3 p2i = (float3)(p2[3u*i], p2[3u*i+1u], p2[3u*i+2u]);
+		const float3 u=p1i-p0i, v=p2i-p0i, w=r_origin-p0i, h=cross(r_direction, v), q=cross(w, u); // bidirectional ray-triangle intersection (Moeller-Trumbore algorithm)
+		const float f=1.0f/dot(u, h), s=f*dot(w, h), t=f*dot(r_direction, q), d=f*dot(v, q);
+		if(s>=0.0f&&s<1.0f&&t>=0.0f&&s+t<1.0f) { // ray-triangle intersection ahead or behind
+			if(d>0.0f) { // ray-triangle intersection ahead
+				if(intersections<64u&&d<65536.0f) distances[intersections] = (ushort)d; // store distance to intersection in array as ushort
+				intersections++;
+			} else { // ray-triangle intersection behind
+				intersections_check++; // cast a second ray to check if starting point is really inside (error correction)
+			}
+		}
+	}/**/
+
+	if(intersections==0u) return; // no intersection for the entire column, so return immediately
+	bool inside = (intersections%2u)&&(intersections_check%2u);
+
+	for(int i=1; i<(int)intersections; i++) { // insertion sort of distances
+		ushort t = distances[i];
+		int j = i-1;
+		while(distances[j]>t&&j>=0) {
+			distances[j+1] = distances[j];
+			j--;
+		}
+		distances[j+1] = t;
+	}
+
+	uint intersection = 0u; // iterate through column
+	const uint h0 = direction==0u ? xyz.x : direction==1u ? xyz.y : xyz.z;
+	for(uint h=h0; h<min(h0+(uint)distances[intersections-1u], (uint)def_N/A); h++) {
+		while(intersection<intersections&&h>h0+(uint)distances[intersection]) {
+			inside = !inside; // passed mesh intersection, so switch inside/outside state
+			intersection++;
+		}
+		const ulong n = index((uint3)(direction==0u?h:xyz.x, direction==1u?h:xyz.y, direction==2u?h:xyz.z));
+		if(inside) flags[n] |= flag;
+	}
+} // voxelize_mesh()
+
+)+R(kernel void unvoxelize_mesh(global uchar* flags, const uchar flag, float x0, float y0, float z0, float x1, float y1, float z1) { // remove voxelized triangle mesh
+	const uint n = get_global_id(0);
+	const float3 p = position(coordinates(n))+(float3)(0.5f*(float)((def_Nx-2u*(def_Dx>1u))*def_Dx)-0.5f, 0.5f*(float)((def_Ny-2u*(def_Dy>1u))*def_Dy)-0.5f, 0.5f*(float)((def_Nz-2u*(def_Dz>1u))*def_Dz)-0.5f)+(float3)(def_domain_offset_x, def_domain_offset_y, def_domain_offset_z);
+	if(p.x>=x0-1.0f&&p.y>=y0-1.0f&&p.z>=z0-1.0f&&p.x<=x1+1.0f&&p.y<=y1+1.0f&&p.z<=z1+1.0f) flags[n] &= ~flag;
+} // unvoxelize_mesh()
+
+
+
 // ################################################## graphics code ##################################################
 
 )+"#ifdef GRAPHICS"+R(
diff --git a/src/lbm.cpp b/src/lbm.cpp
index c7dbe548..5128c920 100644
--- a/src/lbm.cpp
+++ b/src/lbm.cpp
@@ -164,18 +164,31 @@ uint LBM_Domain::get_velocity_set() const {
 	return velocity_set;
 }
 
-void LBM_Domain::voxelize_mesh(const Mesh* mesh, const uchar flag) { // voxelize triangle mesh
+void LBM_Domain::voxelize_mesh_on_device(const Mesh* mesh, const uchar flag) { // voxelize triangle mesh
 	Memory<float3> p0(device, mesh->triangle_number, 1u, mesh->p0);
 	Memory<float3> p1(device, mesh->triangle_number, 1u, mesh->p1);
 	Memory<float3> p2(device, mesh->triangle_number, 1u, mesh->p2);
 	const float x0=mesh->pmin.x, y0=mesh->pmin.y, z0=mesh->pmin.z, x1=mesh->pmax.x, y1=mesh->pmax.y, z1=mesh->pmax.z; // use bounding box of mesh to speed up voxelization
-	Kernel kernel_voxelize_mesh(device, get_N(), "voxelize_mesh", flags, flag, p0, p1, p2, mesh->triangle_number, x0, y0, z0, x1, y1, z1);
+	const float M[3] = { (y1-y0)*(z1-z0), (z1-z0)*(x1-x0), (x1-x0)*(y1-y0) };
+	float Mmin = M[0];
+	uint direction = 0u;
+	for(uint i=1u; i<3u; i++) {
+		if(M[i]<Mmin) {
+			Mmin = M[i];
+			direction = i; // find direction of minimum bounding-box cross-section area
+		}
+	}
+	const ulong A[3] = { (ulong)Ny*(ulong)Nz, (ulong)Nz*(ulong)Nx, (ulong)Nx*(ulong)Ny };
+	Kernel kernel_voxelize_mesh(device, A[direction], "voxelize_mesh", direction, flags, flag, p0, p1, p2, mesh->triangle_number, x0, y0, z0, x1, y1, z1);
 	p0.write_to_device();
 	p1.write_to_device();
 	p2.write_to_device();
-	flags.write_to_device();
 	kernel_voxelize_mesh.run();
-	flags.read_from_device();
+}
+void LBM_Domain::enqueue_unvoxelize_mesh_on_device(const Mesh* mesh, const uchar flag) { // remove voxelized triangle mesh from LBM grid
+	const float x0=mesh->pmin.x, y0=mesh->pmin.y, z0=mesh->pmin.z, x1=mesh->pmax.x, y1=mesh->pmax.y, z1=mesh->pmax.z; // remove all flags in bounding box of mesh
+	Kernel kernel_unvoxelize_mesh(device, get_N(), "unvoxelize_mesh", flags, flag, x0, y0, z0, x1, y1, z1);
+	kernel_unvoxelize_mesh.run();
 }
 
 string LBM_Domain::device_defines() const { return
@@ -188,13 +201,17 @@ string LBM_Domain::device_defines() const { return
 	"\n	#define def_Dy "+to_string(Dy)+"u"
 	"\n	#define def_Dz "+to_string(Dz)+"u"
 
+	"\n	#define def_Ox "+to_string(Ox)+"" // offsets are signed integer!
+	"\n	#define def_Oy "+to_string(Oy)+""
+	"\n	#define def_Oz "+to_string(Oz)+""
+
 	"\n	#define def_Ax "+to_string(Ny*Nz)+"u"
 	"\n	#define def_Ay "+to_string(Nz*Nx)+"u"
 	"\n	#define def_Az "+to_string(Nx*Ny)+"u"
 
-	"\n	#define def_domain_offset_x "+to_string((float)Ox+(float)(Dx>1u)-0.5f*(((float)Dx-1.0f)*(float)(Nx-2u*(Dx>1u))))+"f"
-	"\n	#define def_domain_offset_y "+to_string((float)Oy+(float)(Dy>1u)-0.5f*(((float)Dy-1.0f)*(float)(Ny-2u*(Dy>1u))))+"f"
-	"\n	#define def_domain_offset_z "+to_string((float)Oz+(float)(Dz>1u)-0.5f*(((float)Dz-1.0f)*(float)(Nz-2u*(Dz>1u))))+"f"
+	"\n	#define def_domain_offset_x "+to_string((float)Ox+(float)(Dx>1u)-0.5f*((float)Dx-1.0f)*(float)(Nx-2u*(Dx>1u)))+"f"
+	"\n	#define def_domain_offset_y "+to_string((float)Oy+(float)(Dy>1u)-0.5f*((float)Dy-1.0f)*(float)(Ny-2u*(Dy>1u)))+"f"
+	"\n	#define def_domain_offset_z "+to_string((float)Oz+(float)(Dz>1u)-0.5f*((float)Dz-1.0f)*(float)(Nz-2u*(Dz>1u)))+"f"
 
 	"\n	#define D"+to_string(dimensions)+"Q"+to_string(velocity_set)+"" // D2Q9/D3Q15/D3Q19/D3Q27
 	"\n	#define def_velocity_set "+to_string(velocity_set)+"u" // LBM velocity set (D2Q9/D3Q15/D3Q19/D3Q27)
@@ -343,7 +360,7 @@ void LBM_Domain::Graphics::enqueue_draw_frame() {
 	const bool camera_update = update_camera();
 #if defined(INTERACTIVE_GRAPHICS)||defined(INTERACTIVE_GRAPHICS_ASCII)
 	if(!camera_update&&!camera.key_update&&lbm->get_t()==t_last_frame) return; // don't render a new frame if the scene hasn't changed since last frame
-#endif // INTERACTIVE_GRAPHICS or INTERACTIVE_GRAPHICS_ASCII
+#endif // INTERACTIVE_GRAPHICS||INTERACTIVE_GRAPHICS_ASCII
 	t_last_frame = lbm->get_t();
 	camera.key_update = false;
 	if(camera_update) camera_parameters.enqueue_write_to_device(); // camera_parameters PCIe transfer and kernel_clear execution can happen simulataneously
@@ -733,16 +750,25 @@ void LBM::write_status(const string& path) { // write LBM status report to a .tx
 	write_file(filename, status);
 }
 
-void LBM::voxelize_mesh(const Mesh* mesh, const uchar flag) { // voxelize triangle mesh
-	print_info("Voxelizing mesh. This may take a few minutes."); // if this crashes on Windows, create a TdrDelay 32-bit DWORD with decimal value 300 in Computer\HKEY_LOCAL_MACHINE\SYSTEM\CurrentControlSet\Control\GraphicsDrivers
-	{ thread* threads=new thread[get_D()]; for(uint d=0u; d<get_D(); d++) threads[d]=thread([=]() {
-		lbm[d]->voxelize_mesh(mesh, flag);
-	}); for(uint d=0u; d<get_D(); d++) threads[d].join(); delete[] threads; }
+void LBM::voxelize_mesh_on_device(const Mesh* mesh, const uchar flag) { // voxelize triangle mesh
+	if(get_D()==1u) {
+		lbm[0]->voxelize_mesh_on_device(mesh, flag); // if this crashes on Windows, create a TdrDelay 32-bit DWORD with decimal value 300 in Computer\HKEY_LOCAL_MACHINE\SYSTEM\CurrentControlSet\Control\GraphicsDrivers
+	} else {
+		thread* threads=new thread[get_D()]; for(uint d=0u; d<get_D(); d++) threads[d]=thread([=]() {
+			lbm[d]->voxelize_mesh_on_device(mesh, flag);
+		}); for(uint d=0u; d<get_D(); d++) threads[d].join(); delete[] threads;
+	}
+}
+void LBM::unvoxelize_mesh_on_device(const Mesh* mesh, const uchar flag) { // remove voxelized triangle mesh from LBM grid
+	for(uint d=0u; d<get_D(); d++) lbm[d]->enqueue_unvoxelize_mesh_on_device(mesh, flag);
+	for(uint d=0u; d<get_D(); d++) lbm[d]->finish_queue();
 }
 void LBM::voxelize_stl(const string& path, const float3& center, const float3x3& rotation, const float size, const uchar flag) { // voxelize triangle mesh
 	const Mesh* mesh = read_stl(path, float3(get_Nx(), get_Ny(), get_Nz()), center, rotation, size);
-	voxelize_mesh(mesh, flag);
+	flags.write_to_device();
+	voxelize_mesh_on_device(mesh, flag);
 	delete mesh;
+	flags.read_from_device();
 }
 void LBM::voxelize_stl(const string& path, const float3x3& rotation, const float size, const uchar flag) { // read and voxelize binary .stl file (place in box center)
 	voxelize_stl(path, center(), rotation, size, flag);
diff --git a/src/lbm.hpp b/src/lbm.hpp
index f7944b18..4b0eb758 100644
--- a/src/lbm.hpp
+++ b/src/lbm.hpp
@@ -121,7 +121,8 @@ class LBM_Domain {
 	void set_fz(const float fz) { this->fz = fz; } // set global froce per volume
 	void set_f(const float fx, const float fy, const float fz) { set_fx(fx); set_fy(fy); set_fz(fz); } // set global froce per volume
 
-	void voxelize_mesh(const Mesh* mesh, const uchar flag=TYPE_S); // voxelize mesh
+	void voxelize_mesh_on_device(const Mesh* mesh, const uchar flag=TYPE_S); // voxelize mesh
+	void enqueue_unvoxelize_mesh_on_device(const Mesh* mesh, const uchar flag=TYPE_S); // remove voxelized triangle mesh from LBM grid
 
 	#ifdef GRAPHICS
 	class Graphics {
@@ -472,7 +473,8 @@ class LBM {
 	}
 	void write_status(const string& path=""); // write LBM status report to a .txt file
 
-	void voxelize_mesh(const Mesh* mesh, const uchar flag=TYPE_S); // voxelize mesh
+	void voxelize_mesh_on_device(const Mesh* mesh, const uchar flag=TYPE_S); // voxelize mesh
+	void unvoxelize_mesh_on_device(const Mesh* mesh, const uchar flag=TYPE_S); // remove voxelized triangle mesh from LBM grid
 	void voxelize_stl(const string& path, const float3& center, const float3x3& rotation, const float size=-1.0f, const uchar flag=TYPE_S); // read and voxelize binary .stl file
 	void voxelize_stl(const string& path, const float3x3& rotation, const float size=-1.0f, const uchar flag=TYPE_S); // read and voxelize binary .stl file (place in box center)
 	void voxelize_stl(const string& path, const float3& center, const float size=-1.0f, const uchar flag=TYPE_S); // read and voxelize binary .stl file (no rotation)
diff --git a/src/setup.cpp b/src/setup.cpp
index 7ec31892..50ad9a0b 100644
--- a/src/setup.cpp
+++ b/src/setup.cpp
@@ -237,7 +237,6 @@
 		else lbm.u.y[n] = u;
 		if(x==0u||x==Nx-1u||y==0u||y==Ny-1u||z==0u||z==Nz-1u) lbm.flags[n] = TYPE_E; // all non periodic
 	}	// #########################################################################################################################################################################################
-	//voxelize_triangle(lbm, p0, p1, p2, TYPE_S);
 	lbm.run();
 } /**/
 
@@ -341,8 +340,7 @@
 	const float size = 1.0f*(float)L;
 	const float3 center = float3(lbm.center().x, 32.0f+0.5f*size, lbm.center().z);
 	const float3x3 rotation = float3x3(float3(0, 0, 1), radians(180.0f));
-	voxelize_stl_hull(lbm, get_exe_path()+"../stl/X-wing.stl", center, rotation, size); // https://www.thingiverse.com/thing:353276/files
-	//lbm.voxelize_stl(get_exe_path()+"../stl/X-wing.stl", center, rotation, size); // https://www.thingiverse.com/thing:353276/files
+	lbm.voxelize_stl(get_exe_path()+"../stl/X-wing.stl", center, rotation, size); // https://www.thingiverse.com/thing:353276/files
 	const ulong N=lbm.get_N(); uint Nx=lbm.get_Nx(), Ny=lbm.get_Ny(), Nz=lbm.get_Nz(); for(ulong n=0ull; n<N; n++) { uint x=0u, y=0u, z=0u; lbm.coordinates(n, x, y, z);
 		// ########################################################################### define geometry #############################################################################################
 		if(lbm.flags[n]!=TYPE_S) lbm.u.y[n] = u;
@@ -368,15 +366,7 @@
 
 
 
-/*std::atomic_bool revoxelizing = false;
-void revoxelize(LBM* lbm, Mesh* mesh) { // voxelize new frames in detached thread in parallel while LBM is running
-	for(uint n=0u; n<lbm->get_N(); n++) lbm->flags[n] &= ~TYPE_S; // clear flags
-	const float3x3 rotation = float3x3(float3(0.2f, 1.0f, 0.1f), radians(0.4032f)); // create rotation matrix to rotate mesh
-	mesh->rotate(rotation); // rotate mesh
-	voxelize_mesh_hull(*lbm, mesh, TYPE_S); // voxelize rotated mesh in lbm.flags
-	revoxelizing = false; // indicate new voxelizer thread has finished
-}
-void main_setup() { // Star Wars TIE fighter
+/*void main_setup() { // Star Wars TIE fighter
 	// ######################################################### define simulation box size, viscosity and volume force ############################################################################
 	const uint L = 256u;
 	const float Re = 100000.0f;
@@ -387,7 +377,8 @@ void main_setup() { // Star Wars TIE fighter
 	const float3 center = float3(lbm.center().x, 0.6f*size, lbm.center().z);
 	const float3x3 rotation = float3x3(float3(1, 0, 0), radians(90.0f));
 	Mesh* mesh = read_stl(get_exe_path()+"../stl/TIE-fighter.stl", lbm.size(), center, rotation, size); // https://www.thingiverse.com/thing:2919109/files
-	voxelize_mesh_hull(lbm, mesh, TYPE_S);
+	lbm.voxelize_mesh_on_device(mesh);
+	lbm.flags.read_from_device();
 	const ulong N=lbm.get_N(); uint Nx=lbm.get_Nx(), Ny=lbm.get_Ny(), Nz=lbm.get_Nz(); for(ulong n=0ull; n<N; n++) { uint x=0u, y=0u, z=0u; lbm.coordinates(n, x, y, z);
 		// ########################################################################### define geometry #############################################################################################
 		if(lbm.flags[n]!=TYPE_S) lbm.u.y[n] = u;
@@ -405,12 +396,11 @@ void main_setup() { // Star Wars TIE fighter
 	//	lbm.graphics.write_frame_png(get_exe_path()+"export/f/");
 	//	lbm.graphics.set_camera_free(float3(2.5f*(float)Nx, 0.0f*(float)Ny, 0.0f*(float)Nz), 0.0f, 0.0f, 50.0f);
 	//	lbm.graphics.write_frame_png(get_exe_path()+"export/s/");
-		while(revoxelizing.load()) sleep(0.01f); // wait for voxelizer thread to finish
-		lbm.flags.write_to_device(); // lbm.flags on host is finished, write to device now
-		revoxelizing = true; // indicate new voxelizer thread is starting
-		thread voxelizer(revoxelize, &lbm, mesh); // start new voxelizer thread
-		voxelizer.detach(); // detatch voxelizer thread so LBM can run in parallel
 		lbm.run(28u); // run LBM in parallel while CPU is voxelizing the next frame
+		const float3x3 rotation = float3x3(float3(0.2f, 1.0f, 0.1f), radians(0.4032f)); // create rotation matrix to rotate mesh
+		lbm.unvoxelize_mesh_on_device(mesh);
+		mesh->rotate(rotation); // rotate mesh
+		lbm.voxelize_mesh_on_device(mesh);
 	}
 	//write_file(get_exe_path()+"time.txt", print_time(clock.stop()));
 } /**/
diff --git a/src/shapes.cpp b/src/shapes.cpp
index 8ac562b6..3b219ac4 100644
--- a/src/shapes.cpp
+++ b/src/shapes.cpp
@@ -151,53 +151,4 @@ float plane_plic(const uint x, const uint y, const uint z, const float3& p, cons
 	/**/ if(d0<=-dl) return -1.0f;
 	else if(d0>= dl) return  1.0f;
 	else return plic_cube_inverse(d0, nn);
-}
-
-void voxelize_line(LBM& lbm, const float3& p0, const float3& p1, const uchar flag) { // voxelize line
-	const float xa=p0.x+0.5f, ya=p0.y+0.5f, za=p0.z+0.5f; // start point
-	const float xb=p1.x+0.5f, yb=p1.y+0.5f, zb=p1.z+0.5f; // end point
-	const int dx=(int)sign(xb-xa), dy=(int)sign(yb-ya), dz=(int)sign(zb-za); // fast ray-grid-traversal
-	const float fxa=xa-floor(xa), fya=ya-floor(ya), fza=za-floor(za);
-	int3 xyz = int3(floor(xa), floor(ya), floor(za));
-	const float tdx = fmin((float)dx/(xb-xa), 1E7f);
-	const float tdy = fmin((float)dy/(yb-ya), 1E7f);
-	const float tdz = fmin((float)dz/(zb-za), 1E7f);
-	float tmx = tdx*(dx>0 ? 1.0f-fxa : fxa);
-	float tmy = tdy*(dy>0 ? 1.0f-fya : fya);
-	float tmz = tdz*(dz>0 ? 1.0f-fza : fza);
-	while(true) {
-		if(tmx<tmy) {
-			if(tmx<tmz) { xyz.x += dx; tmx += tdx; } else { xyz.z += dz; tmz += tdz; }
-		} else {
-			if(tmy<tmz) { xyz.y += dy; tmy += tdy; } else { xyz.z += dz; tmz += tdz; }
-		}
-		if(tmx>1.000001f&&tmy>1.000001f&&tmz>1.000001f) return; // terminate at end of ray
-		if(xyz.x<0||xyz.y<0||xyz.z<0||xyz.x>=(int)lbm.get_Nx()||xyz.y>=(int)lbm.get_Ny()||xyz.z>=(int)lbm.get_Nz()) return; // terminate if out of box
-		const ulong n = lbm.index((uint)xyz.x, (uint)xyz.y, (uint)xyz.z);
-		lbm.flags[n] = flag;
-	}
-}
-void voxelize_triangle(LBM& lbm, const float3& p0, const float3& p1, const float3& p2, const uchar flag) { // voxelize triangle
-	const float3 u=p1-p0, v=p2-p1, w=p0-p2;
-	const float ul=length(u), vl=length(v), wl=length(w);
-	for(float i=0.0f; i<ul; i+=0.25f) voxelize_line(lbm, p0+(i/ul)*u, p2, flag);
-	for(float i=0.0f; i<vl; i+=0.25f) voxelize_line(lbm, p1+(i/vl)*v, p0, flag);
-	for(float i=0.0f; i<wl; i+=0.25f) voxelize_line(lbm, p2+(i/wl)*w, p1, flag);
-}
-void voxelize_mesh_hull(LBM& lbm, const Mesh* mesh, const uchar flag) { // voxelize mesh
-	for(uint i=0u; i<mesh->triangle_number; i++) voxelize_triangle(lbm, mesh->p0[i], mesh->p1[i], mesh->p2[i], flag); // voxelize mesh
-}
-void voxelize_stl_hull(LBM& lbm, const string& path, const float3& center, const float3x3& rotation, const float size, const uchar flag) { // read and voxelize binary .stl file
-	const Mesh* mesh = read_stl(path, float3(lbm.get_Nx(), lbm.get_Ny(), lbm.get_Nz()), center, rotation, size);
-	voxelize_mesh_hull(lbm, mesh, flag);
-	delete mesh;
-}
-void voxelize_stl_hull(LBM& lbm, const string& path, const float3x3& rotation, const float size, const uchar flag) { // read and voxelize binary .stl file (place in box center)
-	voxelize_stl_hull(lbm, path, lbm.center(), rotation, size, flag);
-}
-void voxelize_stl_hull(LBM& lbm, const string& path, const float3& center, const float size, const uchar flag) { // read and voxelize binary .stl file (no rotation)
-	voxelize_stl_hull(lbm, path, center, float3x3(1.0f), size, flag);
-}
-void voxelize_stl_hull(LBM& lbm, const string& path, const float size, const uchar flag) { // read and voxelize binary .stl file (place in box center, no rotation)
-	voxelize_stl_hull(lbm, path, lbm.center(), float3x3(1.0f), size, flag);
 }
\ No newline at end of file
diff --git a/src/shapes.hpp b/src/shapes.hpp
index 51dfdc9e..395128ef 100644
--- a/src/shapes.hpp
+++ b/src/shapes.hpp
@@ -22,14 +22,4 @@ float ellipsoid_plic(const uint x, const uint y, const uint z, const float3& p,
 float cylinder_x_plic(const uint x, const uint y, const uint z, const float3& p, const float r, const float l); // bar with PLIC fill levels returned
 float cylinder_y_plic(const uint x, const uint y, const uint z, const float3& p, const float r, const float l); // bar with PLIC fill levels returned
 float cylinder_z_plic(const uint x, const uint y, const uint z, const float3& p, const float r, const float l); // bar with PLIC fill levels returned
-float plane_plic(const uint x, const uint y, const uint z, const float3& p, const float3& n); // plane with PLIC fill levels returned
-
-class LBM; // forward-declare LBM class
-void voxelize_line(LBM& lbm, const float3& p0, const float3& p1, const uchar flag); // voxelize line
-void voxelize_triangle(LBM& lbm, const float3& p0, const float3& p1, const float3& p2, const uchar flag); // voxelize triangle
-void voxelize_mesh_hull(LBM& lbm, const Mesh* mesh, const uchar flag); // voxelize mesh
-
-void voxelize_stl_hull(LBM& lbm, const string& path, const float3& center, const float3x3& rotation, const float size=-1.0f, const uchar flag=TYPE_S); // read and voxelize binary .stl file
-void voxelize_stl_hull(LBM& lbm, const string& path, const float3x3& rotation, const float size=-1.0f, const uchar flag=TYPE_S); // read and voxelize binary .stl file (place in box center)
-void voxelize_stl_hull(LBM& lbm, const string& path, const float3& center, const float size=-1.0f, const uchar flag=TYPE_S); // read and voxelize binary .stl file (no rotation)
-void voxelize_stl_hull(LBM& lbm, const string& path, const float size=-1.0f, const uchar flag=TYPE_S); // read and voxelize binary .stl file (place in box center, no rotation)
\ No newline at end of file
+float plane_plic(const uint x, const uint y, const uint z, const float3& p, const float3& n); // plane with PLIC fill levels returned
\ No newline at end of file