Optimize GL emulation prepareClientAttributes. Fix issues where the slow path was not able to handle unaligned source data. Annotate some unsupported cases. Remove liveClientAttributes, which does not seem to be a win in profiles.

juj · juj · commit 4e3c1b3e862f · 2014-01-10T14:54:24.000+02:00
diff --git a/src/library_gl.js b/src/library_gl.js
@@ -3367,7 +3367,6 @@ var LibraryGL = {
     totalEnabledClientAttributes: 0,
     enabledClientAttributes: [0, 0],
     clientAttributes: [], // raw data, including possible unneeded ones
-    liveClientAttributes: [], // the ones actually alive in the current computation, sorted
     currentRenderer: null, // Caches the currently active FFP emulation renderer, so that it does not have to be re-looked up unless relevant state changes.
     modifiedClientAttributes: false,
     clientActiveTexture: 0,
@@ -3430,17 +3429,17 @@ var LibraryGL = {
       if (GLImmediate.currentRenderer) {
         return GLImmediate.currentRenderer;
       }
-      // return a renderer object given the liveClientAttributes
       // we maintain a cache of renderers, optimized to not generate garbage
-      var attributes = GLImmediate.liveClientAttributes;
       var cacheMap = GLImmediate.rendererCache;
       var temp;
       var keyView = cacheMap.getStaticKeyView().reset();
 
       // By attrib state:
       var enabledAttributesKey = 0;
-      for (var i = 0; i < attributes.length; i++) {
-        enabledAttributesKey |= 1 << attributes[i].name;
+      for (var i = 0; i < GLImmediate.MAX_TEXTURES+3; i++) {
+        if (GLImmediate.enabledClientAttributes[i]) {
+          enabledAttributesKey |= 1 << i;
+        }
       }
       keyView.next(enabledAttributesKey);
 
@@ -3471,7 +3470,13 @@ var LibraryGL = {
       var renderer = keyView.get();
       if (!renderer) {
 #if GL_DEBUG
-        Module.printErr('generating renderer for ' + JSON.stringify(attributes));
+        var liveClientAttributes = [];
+        for (var i = 0; i < GLImmediate.MAX_TEXTURES+3; i++) {
+          if (GLImmediate.enabledClientAttributes[i]) {
+            liveClientAttributes.push(clientAttributes[i]);
+          }
+        }
+        Module.printErr('generating renderer for ' + JSON.stringify(liveClientAttributes));
 #endif
         renderer = GLImmediate.createRenderer();
         GLImmediate.currentRenderer = renderer;
@@ -4077,96 +4082,113 @@ var LibraryGL = {
       // does not work for glBegin/End, where we generate renderer components dynamically and then
       // disable them ourselves, but it does help with glDrawElements/Arrays.
       if (!GLImmediate.modifiedClientAttributes) {
+#if GL_ASSERTIONS
+        if ((GLImmediate.stride & 3) != 0) {
+          Runtime.warnOnce('Warning: Rendering from client side vertex arrays where stride (' + GLImmediate.stride + ') is not a multiple of four! This is not currently supported!');
+        }
+#endif
         GLImmediate.vertexCounter = (GLImmediate.stride * count) / 4; // XXX assuming float
         return;
       }
       GLImmediate.modifiedClientAttributes = false;
 
-      var stride = 0, start;
-      var attributes = GLImmediate.liveClientAttributes;
-      attributes.length = 0;
-      for (var i = 0; i < GLImmediate.NUM_ATTRIBUTES; i++) {
-        if (GLImmediate.enabledClientAttributes[i]) attributes.push(GLImmediate.clientAttributes[i]);
-      }
-      attributes.sort(function(x, y) { return !x ? (!y ? 0 : 1) : (!y ? -1 : (x.pointer - y.pointer)) });
-      start = GL.currArrayBuffer ? 0 : attributes[0].pointer;
-      var multiStrides = false;
-      for (var i = 0; i < attributes.length; i++) {
-        var attribute = attributes[i];
-        if (!attribute) break;
-        if (stride != 0 && stride != attribute.stride) multiStrides = true;
-        if (attribute.stride) stride = attribute.stride;
+      // The role of prepareClientAttributes is to examine the set of client-side vertex attribute buffers
+      // that user code has submitted, and to prepare them to be uploaded to a VBO in GPU memory
+      // (since WebGL does not support client-side rendering, i.e. rendering from vertex data in CPU memory).
+      // User code can generally submit vertex data in three different configurations:
+      // 1. Fully planar: all attributes are in their own separate tightly-packed arrays in CPU memory.
+      // 2. Fully interleaved: all attributes share a single array where data is interleaved something like (pos,uv,normal), (pos,uv,normal), ...
+      // 3. Complex hybrid: Multiple separate arrays that either are sparsely strided, and/or partially interleave vertex attributes.
+
+      // For simplicity, we support only case (2) as the fast case. For (1) and (3), we do a memory copy of the
+      // vertex data here to prepare a re-laid-out buffer that has the structure of case (2). The reason
+      // for this is that it allows the emulation code to get away with using just one VBO for rendering,
+      // and not have to maintain multiple ones. Therefore cases (1) and (3) will be very slow, and case (2) is fast.
+
+      // Detect which case we are in with a quick heuristic that examines the strides of the buffers. If all the buffers have an identical
+      // stride that is at least the combined size of the enabled attributes, we assume we have case (2), otherwise we have something more complex.
+      var clientStartPointer = 0x7FFFFFFF;
+      var bytes = 0; // Total number of bytes taken up by a single vertex.
+      var minStride = 0x7FFFFFFF;
+      var maxStride = 0;
+      for (var i = 0; i < 3+GLImmediate.MAX_TEXTURES; i++) {
+        if (GLImmediate.enabledClientAttributes[i]) {
+          var attr = GLImmediate.clientAttributes[i];
+          clientStartPointer = Math.min(clientStartPointer, attr.pointer);
+          attr.sizeBytes = attr.size * GL.byteSizeByType[attr.type - GL.byteSizeByTypeRoot];
+          bytes += attr.sizeBytes;
+          minStride = Math.min(minStride, attr.stride);
+          maxStride = Math.max(maxStride, attr.stride);
+        }
       }
 
-      if (multiStrides) stride = 0; // we will need to restride
-      var bytes = 0; // total size in bytes
-      if (!stride && !beginEnd) {
-        // beginEnd can not have stride in the attributes, that is fine. otherwise,
-        // no stride means that all attributes are in fact packed. to keep the rest of
-        // our emulation code simple, we perform unpacking/restriding here. this adds overhead, so
-        // it is a good idea to not hit this!
-#if ASSERTIONS
-        Runtime.warnOnce('Unpacking/restriding attributes, this is slow and dangerous');
+      if ((minStride != maxStride || maxStride < bytes) && !beginEnd) {
+        // We are in cases (1) or (3): slow path, shuffle the data around into a single interleaved vertex buffer.
+        // Immediate-mode glBegin()/glEnd() vertex submission is automatically generated in the appropriate layout,
+        // so we never need to take this path when that is used.
+#if GL_ASSERTIONS
+        Runtime.warnOnce('Rendering from planar client-side vertex arrays. This is a very slow emulation path! Use interleaved vertex arrays for best performance.');
 #endif
         if (!GLImmediate.restrideBuffer) GLImmediate.restrideBuffer = _malloc(GL.MAX_TEMP_BUFFER_SIZE);
-        start = GLImmediate.restrideBuffer;
-#if ASSERTIONS
-        assert(start % 4 == 0);
-#endif
+        var start = GLImmediate.restrideBuffer;
+        bytes = 0;
         // calculate restrided offsets and total size
-        for (var i = 0; i < attributes.length; i++) {
-          var attribute = attributes[i];
-          if (!attribute) break;
-          var size = attribute.size * GL.byteSizeByType[attribute.type - GL.byteSizeByTypeRoot];
-          if (size % 4 != 0) size += 4 - (size % 4); // align everything
-          attribute.offset = bytes;
-          bytes += size;
+        for (var i = 0; i < GLImmediate.clientAttributes.length; i++) {
+          if (GLImmediate.enabledClientAttributes[i]) {
+            var attribute = GLImmediate.clientAttributes[i];
+            var size = attribute.sizeBytes;
+            if (size % 4 != 0) size += 4 - (size % 4); // align everything
+            attribute.offset = bytes;
+            bytes += size;
+          }
         }
-#if ASSERTIONS
-        assert(count*bytes <= GL.MAX_TEMP_BUFFER_SIZE);
-#endif
-        // copy out the data (we need to know the stride for that, and define attribute.pointer
-        for (var i = 0; i < attributes.length; i++) {
-          var attribute = attributes[i];
-          if (!attribute) break;
-          var size4 = Math.floor((attribute.size * GL.byteSizeByType[attribute.type - GL.byteSizeByTypeRoot])/4);
-          for (var j = 0; j < count; j++) {
-            for (var k = 0; k < size4; k++) { // copy in chunks of 4 bytes, our alignment makes this possible
-              HEAP32[((start + attribute.offset + bytes*j)>>2) + k] = HEAP32[(attribute.pointer>>2) + j*size4 + k];
+        // copy out the data (we need to know the stride for that, and define attribute.pointer)
+        for (var i = 0; i < GLImmediate.clientAttributes.length; i++) {
+          if (GLImmediate.enabledClientAttributes[i]) {
+            var attribute = GLImmediate.clientAttributes[i];
+            var srcStride = Math.max(attribute.sizeBytes, attribute.stride);
+            if ((srcStride & 3) == 0 && (attribute.sizeBytes & 3) == 0) {
+              var size4 = attribute.sizeBytes>>2;
+              var srcStride4 = Math.max(attribute.sizeBytes, attribute.stride)>>2;
+              for (var j = 0; j < count; j++) {
+                for (var k = 0; k < size4; k++) { // copy in chunks of 4 bytes, our alignment makes this possible
+                  HEAP32[((start + attribute.offset + bytes*j)>>2) + k] = HEAP32[(attribute.pointer>>2) + j*srcStride4 + k];
+                }
+              }
+            } else {
+              for (var j = 0; j < count; j++) {
+                for (var k = 0; k < attribute.sizeBytes; k++) { // source data was not aligned to multiples of 4, must copy byte by byte.
+                  HEAP8[start + attribute.offset + bytes*j + k] = HEAP8[attribute.pointer + j*srcStride + k];
+                }
+              }
             }
+            attribute.pointer = start + attribute.offset;
           }
-          attribute.pointer = start + attribute.offset;
         }
+        GLImmediate.stride = bytes;
+        GLImmediate.vertexPointer = start;
       } else {
-        // normal situation, everything is strided and in the same buffer
-        for (var i = 0; i < attributes.length; i++) {
-          var attribute = attributes[i];
-          if (!attribute) break;
-          attribute.offset = attribute.pointer - start;
-          if (attribute.offset > bytes) { // ensure we start where we should
-#if ASSERTIONS
-            assert((attribute.offset - bytes)%4 == 0); // XXX assuming 4-alignment
-#endif
-            bytes += attribute.offset - bytes;
-          }
-          bytes += attribute.size * GL.byteSizeByType[attribute.type - GL.byteSizeByTypeRoot];
-          if (bytes % 4 != 0) bytes += 4 - (bytes % 4); // XXX assuming 4-alignment
+        // case (2): fast path, all data is interleaved to a single vertex array so we can get away with a single VBO upload.
+        if (GL.currArrayBuffer) {
+          GLImmediate.vertexPointer = 0;
+        } else {
+          GLImmediate.vertexPointer = clientStartPointer;
         }
-#if ASSERTIONS
-        assert(beginEnd || bytes <= stride); // if not begin-end, explicit stride should make sense with total byte size
-#endif
-        if (bytes < stride) { // ensure the size is that of the stride
-          bytes = stride;
+        for (var i = 0; i < 3+GLImmediate.MAX_TEXTURES; i++) {
+          if (GLImmediate.enabledClientAttributes[i]) {
+            var attr = GLImmediate.clientAttributes[i];
+            attr.offset = attr.pointer - clientStartPointer; // Compute what will be the offset of this attribute in the VBO after we upload.
+          }
         }
+        GLImmediate.stride = Math.max(maxStride, bytes);
       }
-      GLImmediate.stride = bytes;
-
       if (!beginEnd) {
-        bytes *= count;
-        if (!GL.currArrayBuffer) {
-          GLImmediate.vertexPointer = start;
+#if GL_ASSERTIONS
+        if ((GLImmediate.stride & 3) != 0) {
+          Runtime.warnOnce('Warning: Rendering from client side vertex arrays where stride (' + GLImmediate.stride + ') is not a multiple of four! This is not currently supported!');
         }
-        GLImmediate.vertexCounter = bytes / 4; // XXX assuming float
+#endif
+        GLImmediate.vertexCounter = (GLImmediate.stride * count) / 4; // XXX assuming float
       }
     },