diff --git a/AUTHORS b/AUTHORS
index 0a359cc6f03e3..30bafcdc1bb3a 100644
--- a/AUTHORS
+++ b/AUTHORS
@@ -204,4 +204,5 @@ a license to everyone to use it as detailed in LICENSE.)
 * Kagami Hiiragi <kagami@genshiken.org>
 * Jan Bölsche <jan@lagomorph.de>
 * Sebastian Matthes <sebastianmatthes@outlook.com> (copyright owned by Volkswagen AG)
+* Robert Goulet <robert.goulet@autodesk.com> (copyright owned by Autodesk, Inc.)
 
diff --git a/embuilder.py b/embuilder.py
index 8e7280bbc9e4d..962dd8c697b3e 100755
--- a/embuilder.py
+++ b/embuilder.py
@@ -44,6 +44,8 @@
         vorbis
         zlib
 
+Issuing 'embuilder.py build ALL' causes each task to be built.
+
 It is also possible to build native_optimizer manually by using CMake. To
 do that, run
 
@@ -72,11 +74,14 @@ def build_port(port_name, lib_name, params):
     int main() {}
   ''', [os.path.join('ports-builds', port_name, lib_name)], params)
 
-
 operation = sys.argv[1]
 
 if operation == 'build':
-  for what in sys.argv[2:]:
+  tasks = sys.argv[2:]
+  if 'ALL' in tasks:
+    tasks = ['libc', 'libc-mt', 'dlmalloc', 'dlmalloc_threadsafe', 'pthreads', 'libcxx', 'libcxx_noexcept', 'libcxxabi', 'gl', 'struct_info', 'native_optimizer', 'bullet', 'freetype', 'libpng', 'ogg', 'sdl2', 'sdl2-image', 'vorbis', 'zlib']
+
+  for what in tasks:
     shared.logging.info('building and verifying ' + what)
     if what in ('libc', 'dlmalloc'):
       build('''
diff --git a/emcc b/emcc
index bfbd61a6d4cee..c0881f9c723e2 100755
--- a/emcc
+++ b/emcc
@@ -811,6 +811,13 @@ try:
     final_suffix = 'mout' # not bitcode, not js; but just dependency rule of the input file
   final_ending = ('.' + final_suffix) if len(final_suffix) > 0 else ''
 
+  # target is now finalized, can finalize other _target s
+  js_target = unsuffixed(target) + '.js'
+
+  if separate_asm:
+    asm_target = js_target[:-3] + '.asm.js'
+    shared.Settings.SEPARATE_ASM = asm_target
+
   # Find library files
   for i, lib in libs:
     logging.debug('looking for library "%s"', lib)
@@ -893,7 +900,6 @@ try:
     assert shared.Settings.UNALIGNED_MEMORY == 0, 'forced unaligned memory not supported in fastcomp'
     assert shared.Settings.FORCE_ALIGNED_MEMORY == 0, 'forced aligned memory is not supported in fastcomp'
     assert shared.Settings.PGO == 0, 'pgo not supported in fastcomp'
-    assert shared.Settings.USE_TYPED_ARRAYS == 2, 'altering USE_TYPED_ARRAYS is not supported'
     assert shared.Settings.QUANTUM_SIZE == 4, 'altering the QUANTUM_SIZE is not supported'
   except Exception, e:
     logging.error('Compiler settings are incompatible with fastcomp. You can fall back to the older compiler core, although that is not recommended, see http://kripken.github.io/emscripten-site/docs/building_from_source/LLVM-Backend.html')
@@ -1252,7 +1258,7 @@ try:
 
     if llvm_lto >= 2:
       logging.debug('running LLVM opts as pre-LTO')
-      final = shared.Building.llvm_opt(in_temp(target_basename + '.bc'), llvm_opts, DEFAULT_FINAL)
+      final = shared.Building.llvm_opt(final, llvm_opts, DEFAULT_FINAL)
       if DEBUG: save_intermediate('opt', 'bc')
 
     # If we can LTO, do it before dce, since it opens up dce opportunities
@@ -1594,8 +1600,6 @@ try:
     f.close()
     src = None
 
-  js_target = unsuffixed(target) + '.js'
-
   if shared.Settings.EMTERPRETIFY:
     flush_js_optimizer_queue()
     logging.debug('emterpretifying')
@@ -1862,7 +1866,6 @@ try {
 
     if separate_asm:
       un_src()
-      asm_target = js_target[:-3] + '.asm.js'
       temp_target = misc_temp_files.get(suffix='.js').name
       execute([shared.PYTHON, shared.path_from_root('tools', 'separate_asm.py'), js_target, asm_target, temp_target])
       shutil.move(temp_target, js_target)
diff --git a/emcmake b/emcmake
index e3baef0e88900..98bfe9210e2dd 100755
--- a/emcmake
+++ b/emcmake
@@ -4,10 +4,5 @@ import os, subprocess, sys
 from tools import shared
 
 configure_path = shared.path_from_root('emconfigure')
-node_js = shared.NODE_JS
-if type(node_js) is list: node_js = ' '.join(node_js)
-node_js = node_js.replace('"', '\"')
 
-exit(subprocess.call([shared.PYTHON, configure_path] + \
-                     [sys.argv[1]] + \
-                     ['-DCMAKE_CROSSCOMPILING_EMULATOR="' + node_js +'"'] + sys.argv[2:]))
+exit(subprocess.call([shared.PYTHON, configure_path] + sys.argv[1:]))
diff --git a/emconfigure b/emconfigure
index 9b29967ffb3db..c2c30d9216669 100755
--- a/emconfigure
+++ b/emconfigure
@@ -31,9 +31,14 @@ variables so that emcc etc. are used. Typical usage:
 (but you can run any command instead of configure)
 
 '''
+elif 'cmake' in sys.argv[1]:
+  node_js = shared.NODE_JS
+  if type(node_js) is list: node_js = ' '.join(node_js)
+  node_js = node_js.replace('"', '\"')
+  sys.argv = sys.argv[:2] + ['-DCMAKE_CROSSCOMPILING_EMULATOR="' + node_js +'"'] + sys.argv[2:]
 
 try:
-	shared.Building.configure(sys.argv[1:])
+  shared.Building.configure(sys.argv[1:])
 except CalledProcessError, e:
-	sys.exit(e.returncode)
+  sys.exit(e.returncode)
 
diff --git a/emscripten-version.txt b/emscripten-version.txt
index edec5febbb738..6861b8cf31789 100644
--- a/emscripten-version.txt
+++ b/emscripten-version.txt
@@ -1,2 +1,2 @@
-1.34.8
+1.34.9
 
diff --git a/emscripten.py b/emscripten.py
index a70d500ac9c08..2748f06d27b8a 100755
--- a/emscripten.py
+++ b/emscripten.py
@@ -286,7 +286,7 @@ def save_settings():
       all_args = ['code'] + args
       asm_const_funcs.append(r'''
 function _emscripten_asm_const_%d(%s) {
- return ASM_CONSTS[code](%s) | 0;
+ return ASM_CONSTS[code](%s);
 }''' % (arity, ', '.join(all_args), ', '.join(args)))
 
     pre = pre.replace('// === Body ===', '// === Body ===\n' + '\nvar ASM_CONSTS = [' + ',\n '.join(asm_consts) + '];\n' + '\n'.join(asm_const_funcs) + '\n')
@@ -530,7 +530,14 @@ def keyfunc(other):
              '"); ' + extra
 
     basic_funcs = ['abort', 'assert'] + [m.replace('.', '_') for m in math_envs]
-    if settings['SAFE_HEAP']: basic_funcs += ['SAFE_HEAP_LOAD', 'SAFE_HEAP_STORE', 'SAFE_FT_MASK']
+
+    asm_safe_heap = settings['SAFE_HEAP'] and not settings['SAFE_HEAP_LOG'] and not settings['RELOCATABLE'] # optimized safe heap in asm, when we can
+
+    if settings['SAFE_HEAP']:
+      if asm_safe_heap:
+        basic_funcs += ['segfault', 'alignfault', 'ftfault']
+      else:
+        basic_funcs += ['SAFE_HEAP_LOAD', 'SAFE_HEAP_LOAD_D', 'SAFE_HEAP_STORE', 'SAFE_HEAP_STORE_D', 'SAFE_FT_MASK']
     if settings['ASSERTIONS']:
       if settings['ASSERTIONS'] >= 2: import difflib
       for sig in last_forwarded_json['Functions']['tables'].iterkeys():
@@ -540,6 +547,8 @@ def keyfunc(other):
     basic_vars = ['STACKTOP', 'STACK_MAX', 'tempDoublePtr', 'ABORT']
     basic_float_vars = []
 
+    if settings['SAFE_HEAP']: basic_vars += ['DYNAMICTOP']
+
     if metadata.get('preciseI64MathUsed'):
       basic_vars += ['cttz_i8']
     else:
@@ -570,6 +579,9 @@ def keyfunc(other):
       if settings.get('EMTERPRETIFY_ASYNC'):
         asm_runtime_funcs += ['setAsyncState', 'emtStackSave', 'emtStackRestore']
 
+    if settings['SAFE_HEAP']:
+      asm_runtime_funcs += ['setDynamicTop']
+
     # function tables
     if not settings['EMULATED_FUNCTION_POINTERS']:
       function_tables = ['dynCall_' + table for table in last_forwarded_json['Functions']['tables']]
@@ -703,6 +715,9 @@ def string_contains_any(s, str_list):
       simd_float_symbols = ['  var SIMD_' + ty + '_' + g + '=SIMD_' + ty + access_quote(g) + ';\n' for ty in simdfloattypes for g in simdfloatfuncs]
       simd_float_symbols = filter(lambda x: not string_contains_any(x, nonexisting_simd_symbols), simd_float_symbols)
       asm_global_funcs += ''.join(simd_float_symbols)
+      # Unofficial, Bool64x2 does not yet exist, but needed for Float64x2 comparisons.
+      if metadata['simdFloat64x2']:
+        asm_global_funcs += '  var SIMD_Int32x4_fromBool64x2Bits = global.SIMD.Int32x4.fromBool64x2Bits;\n';
     if settings['USE_PTHREADS']:
 #      asm_global_funcs += ''.join(['  var Atomics_' + ty + '=global' + access_quote('Atomics') + access_quote(ty) + ';\n' for ty in ['load', 'store', 'exchange', 'compareExchange', 'add', 'sub', 'and', 'or', 'xor', 'fence']])
 # TODO: Once bug https://bugzilla.mozilla.org/show_bug.cgi?id=1141986 is implemented, replace the following line with the above one!
@@ -923,6 +938,82 @@ def string_contains_any(s, str_list):
   HEAP8[tempDoublePtr+6>>0] = HEAP8[ptr+6>>0];
   HEAP8[tempDoublePtr+7>>0] = HEAP8[ptr+7>>0];
 }
+'''] + ['' if not settings['SAFE_HEAP'] else '''
+function setDynamicTop(value) {
+  value = value | 0;
+  DYNAMICTOP = value;
+}
+'''] + ['' if not asm_safe_heap else '''
+function SAFE_HEAP_STORE(dest, value, bytes) {
+  dest = dest | 0;
+  value = value | 0;
+  bytes = bytes | 0;
+  if ((dest|0) <= 0) segfault();
+  if (((dest + bytes)|0) > (DYNAMICTOP|0)) segfault();
+  if ((bytes|0) == 4) {
+    if ((dest&3)) alignfault();
+    HEAP32[dest>>2] = value;
+  } else if ((bytes|0) == 1) {
+    HEAP8[dest>>0] = value;
+  } else {
+    if ((dest&1)) alignfault();
+    HEAP16[dest>>1] = value;
+  }
+}
+function SAFE_HEAP_STORE_D(dest, value, bytes) {
+  dest = dest | 0;
+  value = +value;
+  bytes = bytes | 0;
+  if ((dest|0) <= 0) segfault();
+  if (((dest + bytes)|0) > (DYNAMICTOP|0)) segfault();
+  if ((bytes|0) == 8) {
+    if ((dest&7)) alignfault();
+    HEAPF64[dest>>3] = value;
+  } else {
+    if ((dest&3)) alignfault();
+    HEAPF32[dest>>2] = value;
+  }
+}
+function SAFE_HEAP_LOAD(dest, bytes, unsigned) {
+  dest = dest | 0;
+  bytes = bytes | 0;
+  unsigned = unsigned | 0;
+  if ((dest|0) <= 0) segfault();
+  if ((dest + bytes|0) > (DYNAMICTOP|0)) segfault();
+  if ((bytes|0) == 4) {
+    if ((dest&3)) alignfault();
+    return HEAP32[dest>>2] | 0;
+  } else if ((bytes|0) == 1) {
+    if (unsigned) {
+      return HEAPU8[dest>>0] | 0;
+    } else {
+      return HEAP8[dest>>0] | 0;
+    }
+  }
+  if ((dest&1)) alignfault();
+  if (unsigned) return HEAPU16[dest>>1] | 0;
+  return HEAP16[dest>>1] | 0;
+}
+function SAFE_HEAP_LOAD_D(dest, bytes) {
+  dest = dest | 0;
+  bytes = bytes | 0;
+  if ((dest|0) <= 0) segfault();
+  if ((dest + bytes|0) > (DYNAMICTOP|0)) segfault();
+  if ((bytes|0) == 8) {
+    if ((dest&7)) alignfault();
+    return +HEAPF64[dest>>3];
+  }
+  if ((dest&3)) alignfault();
+  return +HEAPF32[dest>>2];
+}
+function SAFE_FT_MASK(value, mask) {
+  value = value | 0;
+  mask = mask | 0;
+  var ret = 0;
+  ret = value & mask;
+  if ((ret|0) != (value|0)) ftfault();
+  return ret | 0;
+}
 '''] + ['''
 function setTempRet0(value) {
   value = value|0;
@@ -951,6 +1042,11 @@ def string_contains_any(s, str_list):
 Runtime.stackRestore = asm['stackRestore'];
 Runtime.establishStackSpace = asm['establishStackSpace'];
 ''')
+      if settings['SAFE_HEAP']:
+        funcs_js.append('''
+Runtime.setDynamicTop = asm['setDynamicTop'];
+''')
+
     if not settings['RELOCATABLE']:
       funcs_js.append('''
 Runtime.setTempRet0 = asm['setTempRet0'];
diff --git a/site/source/docs/api_reference/preamble.js.rst b/site/source/docs/api_reference/preamble.js.rst
index fa7a8a177642a..654027bc464af 100644
--- a/site/source/docs/api_reference/preamble.js.rst
+++ b/site/source/docs/api_reference/preamble.js.rst
@@ -157,6 +157,24 @@ Conversion functions — strings, pointers and arrays
 	:rtype: String
 
 
+.. js:function:: UTF8ToString(ptr)
+
+	Given a pointer ``ptr`` to a null-terminated UTF8-encoded string in the Emscripten HEAP, returns a copy of that string as a JavaScript ``String`` object.
+
+	:param ptr: A pointer to a null-terminated UTF8-encoded string in the Emscripten HEAP.
+	:returns: A JavaScript ``String`` object
+	
+
+
+.. js:function:: stringToUTF8(str, outPtr[, maxBytesToWrite])
+
+	Copies the given JavaScript ``String`` object ``str`` to the Emscripten HEAP at address ``outPtr``, null-terminated and encoded in UTF8 form.
+
+	:param str: A JavaScript ``String`` object.
+	:type str: String
+	:param outPtr: Pointer to data copied from ``str``, encoded in UTF8 format and null-terminated.
+	:param maxBytesToWrite: A limit on the number of bytes to write out.
+
 
 .. js:function:: UTF16ToString(ptr)
 
@@ -167,7 +185,7 @@ Conversion functions — strings, pointers and arrays
 	
 
 
-.. js:function:: stringToUTF16(str, outPtr)
+.. js:function:: stringToUTF16(str, outPtr[, maxBytesToWrite])
 
 	Copies the given JavaScript ``String`` object ``str`` to the Emscripten HEAP at address ``outPtr``, null-terminated and encoded in UTF16LE form. 
 	
@@ -176,6 +194,7 @@ Conversion functions — strings, pointers and arrays
 	:param str: A JavaScript ``String`` object.
 	:type str: String
 	:param outPtr: Pointer to data copied from ``str``, encoded in UTF16LE format and null-terminated.
+	:param maxBytesToWrite: A limit on the number of bytes to write out.
 
 
 
@@ -187,7 +206,7 @@ Conversion functions — strings, pointers and arrays
 	:returns: A JavaScript ``String`` object.
 	
 
-.. js:function:: stringToUTF32(str, outPtr)
+.. js:function:: stringToUTF32(str, outPtr[, maxBytesToWrite])
 
 	Copies the given JavaScript ``String`` object ``str`` to the Emscripten HEAP at address ``outPtr``, null-terminated and encoded in UTF32LE form. 
 	
@@ -196,6 +215,7 @@ Conversion functions — strings, pointers and arrays
 	:param str: A JavaScript ``String`` object.
 	:type str: String
 	:param outPtr: Pointer to data copied from ``str``, encoded in encoded in UTF32LE format and null-terminated.
+	:param maxBytesToWrite: A limit on the number of bytes to write out.
 
 
 
diff --git a/site/source/docs/getting_started/FAQ.rst b/site/source/docs/getting_started/FAQ.rst
index 16fa48c0666b6..ce71499f2bbe1 100644
--- a/site/source/docs/getting_started/FAQ.rst
+++ b/site/source/docs/getting_started/FAQ.rst
@@ -255,6 +255,10 @@ To make sure a C function remains available to be called from normal JavaScript,
 
 	./emcc -s EXPORTED_FUNCTIONS="['_main', '_my_func']"  ...
 
+.. note:: 
+
+   `_main` should be in the export list, as in that example, if you have a `main()` function. Otherwise, it will be removed as dead code; there is no special logic to keep `main()` alive by default.
+
 .. note:: 
 
    `EXPORTED_FUNCTIONS` affects compilation to JavaScript. If you first compile to an object file,
diff --git a/site/source/docs/optimizing/Optimizing-Code.rst b/site/source/docs/optimizing/Optimizing-Code.rst
index 13f6e40b4d4fc..e565546e7ef05 100644
--- a/site/source/docs/optimizing/Optimizing-Code.rst
+++ b/site/source/docs/optimizing/Optimizing-Code.rst
@@ -119,6 +119,12 @@ You can also do this manually, as follows:
 
 .. _optimizing-code-outlining:
 
+Running by itself
+-----------------
+
+If you hit memory limits in browsers, it can help to run your project by itself, as opposed to inside a web page containing other content. If you open a new web page (as a new tab, or a new window) that contains just your project, then you have the best chance at avoiding memory framentation issues.
+
+
 Outlining
 ---------
 
diff --git a/site/source/docs/porting/connecting_cpp_and_javascript/Interacting-with-code.rst b/site/source/docs/porting/connecting_cpp_and_javascript/Interacting-with-code.rst
index e12f4a6f30dc5..bbd7ea2c2d223 100644
--- a/site/source/docs/porting/connecting_cpp_and_javascript/Interacting-with-code.rst
+++ b/site/source/docs/porting/connecting_cpp_and_javascript/Interacting-with-code.rst
@@ -119,8 +119,13 @@ parameters to pass to the function:
 
      - Exporting is done at compile time. For example:
        ``-s EXPORTED_FUNCTIONS='["_main","_other_function"]'`` exports
-       ``main()`` and ``other_function()``. You need ``_`` at the
+       ``main()`` and ``other_function()``.
+     - Note that you need ``_`` at the
        beginning of the function names in the ``EXPORTED_FUNCTIONS`` list.
+     - Note that ``_main`` is mentioned in that list. If you don't have it there,
+       the compiler will eliminate it as dead code. The list of exported
+       functions is the **entire** list that will be kept alive (unless other
+       code was kept alive in another manner).
      - Emscripten does :ref:`dead code elimination <faq-dead-code-elimination>`
        to minimize code size — exporting ensures the functions you need
        aren't removed.
diff --git a/site/source/docs/porting/pthreads.rst b/site/source/docs/porting/pthreads.rst
index cade0eeaeb95f..56bcab258d92e 100644
--- a/site/source/docs/porting/pthreads.rst
+++ b/site/source/docs/porting/pthreads.rst
@@ -26,7 +26,7 @@ The Emscripten implementation for the pthreads API should follow the POSIX stand
 
 - When -s PTHREAD_POOL_SIZE=<integer> is not specified and pthread_create() is called, the new thread will not actually start to run immediately, but the main JS thread must yield execution back to browser first. This behavior is a result of `#1049079 <https://bugzilla.mozilla.org/show_bug.cgi?id=1049079>`.
 
-- Currently several of the functions in the C runtime, such as filesystem functions like fopen(), fread(), printf(), fprintf() etc. are not multithreaded, but instead their execution is proxied over to the main application thread. Memory allocation via malloc() and free() is fully multithreaded though.
+- Currently several of the functions in the C runtime, such as filesystem functions like fopen(), fread(), printf(), fprintf() etc. are not multithreaded, but instead their execution is proxied over to the main application thread. Memory allocation via malloc() and free() is fully multithreaded though. This proxying can generate a deadlock in a special situation that native code running pthreads does not have. See `bug 3495 <https://github.com/kripken/emscripten/issues/3495>` for more information and how to work around this until proxying is no longer needed in Emscripten.
 
 - The Emscripten implementation does not support `POSIX signals <http://man7.org/linux/man-pages/man7/signal.7.html>`, which are sometimes used in conjunction with pthreads. This is because it is not possible to send signals to web workers and pre-empt their execution. The only exception to this is pthread_kill() which can be used as normal to forcibly terminate a running thread.
 
diff --git a/src/deps_info.json b/src/deps_info.json
index 95d2452dfb13a..36f3872dc9593 100644
--- a/src/deps_info.json
+++ b/src/deps_info.json
@@ -39,6 +39,7 @@
   "glfwSleep": ["sleep"],
   "bind": ["htonl", "htons", "ntohs"],
   "connect": ["htonl", "htons", "ntohs"],
+  "socket": ["htonl", "htons", "ntohs"],
   "sleep": ["usleep"]
 }
 
diff --git a/src/ecmascript_simd.js b/src/ecmascript_simd.js
index 62f9b84de38c8..554d222113564 100644
--- a/src/ecmascript_simd.js
+++ b/src/ecmascript_simd.js
@@ -117,7 +117,15 @@ function simdCheckLaneIndex(index, lanes) {
 var lanes = [];
 
 function simdCreate(type) {
-  return type.fn.apply(type.fn, lanes);
+  // XXX Emscripten:
+  // Work around v8 NaN canonicalization issue: if lanes contains floats with non-canonical NaN bit patterns,
+  // type.fn.apply() will canonicalize the NaNs and the bits are lost (most likely as part of float->double expansion).
+  // Directly passing the arguments into the function preserves them.
+  if (type.name == "Float32x4") {
+    return SIMD.Float32x4(lanes[0], lanes[1], lanes[2], lanes[3]);
+  } else {
+    return type.fn.apply(type.fn, lanes);
+  }
 }
 
 function simdToString(type, a) {
@@ -461,7 +469,14 @@ if (typeof SIMD.Float32x4 === "undefined" ||
     if (!(this instanceof SIMD.Float32x4)) {
       return new SIMD.Float32x4(s0, s1, s2, s3);
     }
-    this.s_ = convertArray(_f32x4, [s0, s1, s2, s3]);
+    // XXX Emscripten:
+    // Don't use convertArray() here to construct the Float32x4, since v8 most likely due to float->double
+    // expansion will lose noncanonical NaN bits if present, producing an incorrect bit pattern as a result.
+    this.s_ = new Float32Array(new ArrayBuffer(16));
+    this.s_[0] = s0;
+    this.s_[1] = s1;
+    this.s_[2] = s2;
+    this.s_[3] = s3;
   }
 
   SIMD.Float32x4.extractLane = function(v, i) {
@@ -809,6 +824,11 @@ var allTypes = [float64x2, float32x4,
                 uint32x4, uint16x8, uint8x16,
                 bool32x4, bool16x8, bool8x16];
 
+// XXX Emscripten: Add member functions to Bool64x2 as well.
+allTypes.push(bool64x2);
+// XXX Emscripten: Float64x2 value conversion to other types (In two lowest channels. Two highest channels zero).
+float64x2.from = [int32x4, uint32x4, float32x4];
+
 // SIMD prototype functions.
 var prototypeFns = {
   valueOf:
@@ -1320,3 +1340,93 @@ if (typeof SIMD.Uint8x16.shuffle === "undefined") {
       typeof global === 'object')
      ? global
      : this);
+
+// XXX Emscripten-specific below XXX
+
+// Work around Firefox Nightly bug that Float64x2 comparison return a Int32x4 instead of a Bool64x2.
+try {
+  if (SIMD.Int32x4.check(SIMD.Float64x2.equal(SIMD.Float64x2.splat(5.0), SIMD.Float64x2.splat(5.0)))) {
+    SIMD.Float64x2.prevEqual = SIMD.Float64x2.equal;
+    SIMD.Float64x2.equal = function(a, b) {
+      var int32x4 = SIMD.Float64x2.prevEqual(a, b);
+      return SIMD.Bool64x2(SIMD.Int32x4.extractLane(int32x4, 1) != 0, SIMD.Int32x4.extractLane(int32x4, 3) != 0);
+    }
+    console.error('Warning: Patching up SIMD.Float64x2.equal to return a Bool64x2 instead of Int32x4!');
+  }
+} catch(e) {}
+try {
+  if (SIMD.Int32x4.check(SIMD.Float64x2.notEqual(SIMD.Float64x2.splat(5.0), SIMD.Float64x2.splat(5.0)))) {
+    SIMD.Float64x2.prevNotEqual = SIMD.Float64x2.notEqual;
+    SIMD.Float64x2.notEqual = function(a, b) {
+      var int32x4 = SIMD.Float64x2.prevNotEqual(a, b);
+      return SIMD.Bool64x2(SIMD.Int32x4.extractLane(int32x4, 1) != 0, SIMD.Int32x4.extractLane(int32x4, 3) != 0);
+    } 
+    console.error('Warning: Patching up SIMD.Float64x2.notEqual to return a Bool64x2 instead of Int32x4!');
+  }
+} catch(e) {}
+try {
+  if (SIMD.Int32x4.check(SIMD.Float64x2.greaterThan(SIMD.Float64x2.splat(5.0), SIMD.Float64x2.splat(5.0)))) {
+    SIMD.Float64x2.prevGreaterThan = SIMD.Float64x2.greaterThan;
+    SIMD.Float64x2.greaterThan = function(a, b) {
+      var int32x4 = SIMD.Float64x2.prevGreaterThan(a, b);
+      return SIMD.Bool64x2(SIMD.Int32x4.extractLane(int32x4, 1) != 0, SIMD.Int32x4.extractLane(int32x4, 3) != 0);
+    } 
+    console.error('Warning: Patching up SIMD.Float64x2.greaterThan to return a Bool64x2 instead of Int32x4!');
+  }
+} catch(e) {}
+try {
+  if (SIMD.Int32x4.check(SIMD.Float64x2.greaterThanOrEqual(SIMD.Float64x2.splat(5.0), SIMD.Float64x2.splat(5.0)))) {
+    SIMD.Float64x2.prevGreaterThanOrEqual = SIMD.Float64x2.greaterThanOrEqual;
+    SIMD.Float64x2.greaterThanOrEqual = function(a, b) {
+      var int32x4 = SIMD.Float64x2.prevGreaterThanOrEqual(a, b);
+      return SIMD.Bool64x2(SIMD.Int32x4.extractLane(int32x4, 1) != 0, SIMD.Int32x4.extractLane(int32x4, 3) != 0);
+    } 
+    console.error('Warning: Patching up SIMD.Float64x2.greaterThanOrEqual to return a Bool64x2 instead of Int32x4!');
+  }
+} catch(e) {}
+try {
+  if (SIMD.Int32x4.check(SIMD.Float64x2.lessThan(SIMD.Float64x2.splat(5.0), SIMD.Float64x2.splat(5.0)))) {
+    SIMD.Float64x2.prevLessThan = SIMD.Float64x2.lessThan;
+    SIMD.Float64x2.lessThan = function(a, b) {
+      var int32x4 = SIMD.Float64x2.prevLessThan(a, b);
+      return SIMD.Bool64x2(SIMD.Int32x4.extractLane(int32x4, 1) != 0, SIMD.Int32x4.extractLane(int32x4, 3) != 0);
+    } 
+    console.error('Warning: Patching up SIMD.Float64x2.lessThan to return a Bool64x2 instead of Int32x4!');
+  }
+} catch(e) {}
+try {
+  if (SIMD.Int32x4.check(SIMD.Float64x2.lessThanOrEqual(SIMD.Float64x2.splat(5.0), SIMD.Float64x2.splat(5.0)))) {
+    SIMD.Float64x2.prevLessThanOrEqual = SIMD.Float64x2.lessThanOrEqual;
+    SIMD.Float64x2.lessThanOrEqual = function(a, b) {
+      var int32x4 = SIMD.Float64x2.prevLessThanOrEqual(a, b);
+      return SIMD.Bool64x2(SIMD.Int32x4.extractLane(int32x4, 1) != 0, SIMD.Int32x4.extractLane(int32x4, 3) != 0);
+    } 
+    console.error('Warning: Patching up SIMD.Float64x2.lessThanOrEqual to return a Bool64x2 instead of Int32x4!');
+  }
+} catch(e) {}
+
+
+if (!SIMD.Int32x4.fromBool64x2Bits) {
+  SIMD.Int32x4.fromBool64x2Bits = function(bool64x2) {
+    var lane0 = SIMD.Bool64x2.extractLane(bool64x2, 0)?-1:0;
+    var lane1 = SIMD.Bool64x2.extractLane(bool64x2, 1)?-1:0;
+    return SIMD.Int32x4(lane0, lane0, lane1, lane1);
+  }
+}
+
+// TODO: Remove and replace with shiftRightScalar once https://bugzilla.mozilla.org/show_bug.cgi?id=1201934 lands.
+if (!SIMD.Int8x16.shiftRightLogicalByScalar) {
+  SIMD.Int8x16.shiftRightLogicalByScalar = function(s, v) {
+    return SIMD.Int8x16.fromUint8x16Bits(SIMD.Uint8x16.shiftRightLogicalByScalar(SIMD.Uint8x16.fromInt8x16Bits(s), v));
+  }
+}
+if (!SIMD.Int16x8.shiftRightLogicalByScalar) {
+  SIMD.Int16x8.shiftRightLogicalByScalar = function(s, v) {
+    return SIMD.Int16x8.fromUint16x8Bits(SIMD.Uint16x8.shiftRightLogicalByScalar(SIMD.Uint16x8.fromInt16x8Bits(s), v));
+  }
+}
+if (!SIMD.Int32x4.shiftRightLogicalByScalar) {
+  SIMD.Int32x4.shiftRightLogicalByScalar = function(s, v) {
+    return SIMD.Int32x4.fromUint32x4Bits(SIMD.Uint32x4.shiftRightLogicalByScalar(SIMD.Uint32x4.fromInt32x4Bits(s), v));
+  }
+}
diff --git a/src/jsifier.js b/src/jsifier.js
index 3e9c10f2063bc..e35e2865b7775 100644
--- a/src/jsifier.js
+++ b/src/jsifier.js
@@ -130,8 +130,10 @@ function JSify(data, functionsOnly) {
       if ((!LibraryManager.library.hasOwnProperty(ident) && !LibraryManager.library.hasOwnProperty(ident + '__inline')) || SIDE_MODULE) {
         if (notDep) {
           if (VERBOSE || ident.substr(0, 11) !== 'emscripten_') { // avoid warning on emscripten_* functions which are for internal usage anyhow
-            if (ERROR_ON_UNDEFINED_SYMBOLS) error('unresolved symbol: ' + ident);
-            else if (VERBOSE || (WARN_ON_UNDEFINED_SYMBOLS && !LINKABLE)) warn('unresolved symbol: ' + ident);
+            if (!LINKABLE) {
+              if (ERROR_ON_UNDEFINED_SYMBOLS) error('unresolved symbol: ' + ident);
+              else if (VERBOSE || WARN_ON_UNDEFINED_SYMBOLS) warn('unresolved symbol: ' + ident);
+            }
           }
         }
         if (!(MAIN_MODULE || SIDE_MODULE)) {
diff --git a/src/library.js b/src/library.js
index db8a8a0405412..051487dcb483d 100644
--- a/src/library.js
+++ b/src/library.js
@@ -3335,7 +3335,7 @@ LibraryManager.library = {
     return 0;
   },
 
-  getaddrinfo__deps: ['$Sockets', '$DNS', '_inet_pton4_raw', '_inet_ntop4_raw', '_inet_pton6_raw', '_inet_ntop6_raw', '_write_sockaddr', 'htonl'],
+  getaddrinfo__deps: ['$Sockets', '$DNS', '_inet_pton4_raw', '_inet_ntop4_raw', '_inet_pton6_raw', '_inet_ntop6_raw', '_write_sockaddr'],
   getaddrinfo: function(node, service, hint, out) {
     // Note getaddrinfo currently only returns a single addrinfo with ai_next defaulting to NULL. When NULL
     // hints are specified or ai_family set to AF_UNSPEC or ai_socktype or ai_protocol set to 0 then we
diff --git a/src/library_gl.js b/src/library_gl.js
index 94a54c0bf8ddc..519f5143404b3 100644
--- a/src/library_gl.js
+++ b/src/library_gl.js
@@ -475,6 +475,9 @@ var LibraryGL = {
           sizePerPixel = numChannels*1;
           break;
         case 0x1403 /* GL_UNSIGNED_SHORT */:
+#if USE_WEBGL2
+        case 0x140B /* GL_HALF_FLOAT */:
+#endif
         case 0x8D61 /* GL_HALF_FLOAT_OES */:
           sizePerPixel = numChannels*2;
           break;
@@ -482,7 +485,7 @@ var LibraryGL = {
         case 0x1406 /* GL_FLOAT */:
           sizePerPixel = numChannels*4;
           break;
-        case 0x84FA /* UNSIGNED_INT_24_8_WEBGL */:
+        case 0x84FA /* UNSIGNED_INT_24_8_WEBGL/UNSIGNED_INT_24_8 */:
           sizePerPixel = 4;
           break;
         case 0x8363 /* GL_UNSIGNED_SHORT_5_6_5 */:
diff --git a/src/library_glfw.js b/src/library_glfw.js
index c920dfd3e32a9..afa9f1029fbde 100644
--- a/src/library_glfw.js
+++ b/src/library_glfw.js
@@ -627,6 +627,9 @@ var LibraryGLFW = {
         Module.ctx = Browser.createContext(Module['canvas'], true, true, contextAttributes);
       }
 
+      // If context creation failed, do not return a valid window
+      if (!Module.ctx) return 0;
+
       // Get non alive id
       var win = new GLFW.Window(id, width, height, title, monitor, share);
 
diff --git a/src/library_syscall.js b/src/library_syscall.js
index 6e10d73cb3d8f..539f73d242c7d 100644
--- a/src/library_syscall.js
+++ b/src/library_syscall.js
@@ -446,6 +446,12 @@ var SyscallsLibrary = {
         }
         return newsock.stream.fd;
       }
+      case 6: { // getsockname
+        var sock = SYSCALLS.getSocketFromFD(), addr = SYSCALLS.get(), addrlen = SYSCALLS.get();
+        var res = __write_sockaddr(addr, sock.family, DNS.lookup_name(sock.daddr || '0.0.0.0'), sock.dport);
+        assert(!res.errno);
+        return 0;
+      }
       case 11: { // sendto
         var sock = SYSCALLS.getSocketFromFD(), message = SYSCALLS.get(), length = SYSCALLS.get(), flags = SYSCALLS.get(), dest = SYSCALLS.getSocketAddress(true);
         var slab = {{{ makeGetSlabs('message', 'i8', true) }}};
diff --git a/src/mini-lz4.js b/src/mini-lz4.js
index 6cbb16b23cd59..67d0ed8506441 100644
--- a/src/mini-lz4.js
+++ b/src/mini-lz4.js
@@ -113,6 +113,10 @@ var
 
 ,	hasher 			= /* XXX uint32( */ 2654435761 /* ) */
 
+assert(hashShift === 16);
+var hashTable = new Int16Array(1<<16);
+var empty = new Int16Array(hashTable.length);
+
 // CompressBound returns the maximum length of a lz4 block, given it's uncompressed length
 exports.compressBound = function (isize) {
 	return isize > maxInputSize
@@ -121,15 +125,11 @@ exports.compressBound = function (isize) {
 }
 
 exports.compress = function (src, dst, sIdx, eIdx) {
-	// V8 optimization: non sparse array with integers
-	var hashTable = new Array(hashSize)
-	for (var i = 0; i < hashSize; i++) {
-		hashTable[i] = 0
-	}
-	return compressBlock(src, dst, 0, hashTable, sIdx || 0, eIdx || dst.length)
+	hashTable.set(empty);
+	return compressBlock(src, dst, 0, sIdx || 0, eIdx || dst.length)
 }
 
-function compressBlock (src, dst, pos, hashTable, sIdx, eIdx) {
+function compressBlock (src, dst, pos, sIdx, eIdx) {
 	// XXX var Hash = uint32() // Reusable unsigned 32 bits integer
 	var dpos = sIdx
 	var dlen = eIdx - sIdx
@@ -333,6 +333,8 @@ exports.compressPackage = function(data, verify) {
   return compressedData;
 };
 
+assert(exports.CHUNK_SIZE < (1 << 15)); // we use 16-bit ints as the type of the hash table, chunk size must be smaller
+
 return exports;
 
 })();
diff --git a/src/parseTools.js b/src/parseTools.js
index 840c1e384b810..a412fde40e9d0 100644
--- a/src/parseTools.js
+++ b/src/parseTools.js
@@ -883,7 +883,7 @@ function makeGetValue(ptr, pos, type, noNeedFirst, unsigned, ignore, align, noSa
     if (printType !== 'null' && printType[0] !== '#') printType = '"' + safeQuote(printType) + '"';
     if (printType[0] === '#') printType = printType.substr(1);
     if (!ignore) {
-      return asmCoercion('SAFE_HEAP_LOAD(' + asmCoercion(offset, 'i32') + ', ' + Runtime.getNativeTypeSize(type) + ', ' + ((type in Compiletime.FLOAT_TYPES)|0) + ', ' + (!!unsigned+0) + ')', type);
+      return asmCoercion('SAFE_HEAP_LOAD' + ((type in Compiletime.FLOAT_TYPES) ? '_D' : '') + '(' + asmCoercion(offset, 'i32') + ', ' + Runtime.getNativeTypeSize(type) + ', ' + (!!unsigned+0) + ')', type);
     }
   }
   var ret = makeGetSlabs(ptr, type, false, unsigned)[0] + '[' + getHeapOffset(offset, type) + ']';
@@ -971,7 +971,7 @@ function makeSetValue(ptr, pos, value, type, noNeedFirst, ignore, align, noSafe,
     if (printType !== 'null' && printType[0] !== '#') printType = '"' + safeQuote(printType) + '"';
     if (printType[0] === '#') printType = printType.substr(1);
     if (!ignore) {
-      return asmCoercion('SAFE_HEAP_STORE(' + asmCoercion(offset, 'i32') + ', ' + asmCoercion(value, type) + ', ' + Runtime.getNativeTypeSize(type) + ', ' + ((type in Compiletime.FLOAT_TYPES)|0) + ')', type);
+      return 'SAFE_HEAP_STORE' + ((type in Compiletime.FLOAT_TYPES) ? '_D' : '') + '(' + asmCoercion(offset, 'i32') + ', ' + asmCoercion(value, type) + ', ' + Runtime.getNativeTypeSize(type) + ')';
     }
   }
   return makeGetSlabs(ptr, type, true).map(function(slab) { return slab + '[' + getHeapOffset(offset, type) + ']=' + value }).join(sep);
diff --git a/src/preamble.js b/src/preamble.js
index 387adde65cef0..6e45cf05a0bdf 100644
--- a/src/preamble.js
+++ b/src/preamble.js
@@ -54,8 +54,11 @@ function SAFE_HEAP_STORE(dest, value, bytes, isFloat) {
   assert(DYNAMICTOP <= TOTAL_MEMORY);
   setValue(dest, value, getSafeHeapType(bytes, isFloat), 1);
 }
+function SAFE_HEAP_STORE_D(dest, value, bytes) {
+  SAFE_HEAP_STORE(dest, value, bytes, true);
+}
 
-function SAFE_HEAP_LOAD(dest, bytes, isFloat, unsigned) {
+function SAFE_HEAP_LOAD(dest, bytes, unsigned, isFloat) {
   if (dest <= 0) abort('segmentation fault loading ' + bytes + ' bytes from address ' + dest);
   if (dest % bytes !== 0) abort('alignment error loading from address ' + dest + ', which was expected to be aligned to a multiple of ' + bytes);
   if (dest + bytes > Math.max(DYNAMICTOP, STATICTOP)) abort('segmentation fault, exceeded the top of the available heap when loading ' + bytes + ' bytes from address ' + dest + '. STATICTOP=' + STATICTOP + ', DYNAMICTOP=' + DYNAMICTOP);
@@ -68,6 +71,9 @@ function SAFE_HEAP_LOAD(dest, bytes, isFloat, unsigned) {
 #endif
   return ret;
 }
+function SAFE_HEAP_LOAD_D(dest, bytes, unsigned) {
+  return SAFE_HEAP_LOAD(dest, bytes, unsigned, true);
+}
 
 function SAFE_FT_MASK(value, mask) {
   var ret = value & mask;
@@ -76,6 +82,16 @@ function SAFE_FT_MASK(value, mask) {
   }
   return ret;
 }
+
+function segfault() {
+  abort('segmentation fault');
+}
+function alignfault() {
+  abort('alignment fault');
+}
+function ftfault() {
+  abort('Function table mask error');
+}
 #endif
 
 //========================================
@@ -1012,7 +1028,12 @@ var STACK_BASE = 0, STACKTOP = 0, STACK_MAX = 0; // stack area
 var DYNAMIC_BASE = 0, DYNAMICTOP = 0; // dynamic area handled by sbrk
 
 #if USE_PTHREADS
-if (ENVIRONMENT_IS_PTHREAD) staticSealed = true; // The static memory area has been initialized already in the main thread, pthreads skip this.
+if (ENVIRONMENT_IS_PTHREAD) {
+  staticSealed = true; // The static memory area has been initialized already in the main thread, pthreads skip this.
+#if SEPARATE_ASM != 0
+  importScripts('{{{ SEPARATE_ASM }}}'); // load the separated-out asm.js
+#endif
+}
 #endif
 
 function enlargeMemory() {
@@ -1130,7 +1151,9 @@ while (totalMemory < TOTAL_MEMORY || totalMemory < 2*TOTAL_STACK) {
 totalMemory = Math.max(totalMemory, 16*1024*1024);
 #endif
 if (totalMemory !== TOTAL_MEMORY) {
+#if ASSERTIONS
   Module.printErr('increasing TOTAL_MEMORY to ' + totalMemory + ' to be compliant with the asm.js spec (and given that TOTAL_STACK=' + TOTAL_STACK + ')');
+#endif
   TOTAL_MEMORY = totalMemory;
 }
 
diff --git a/src/runtime.js b/src/runtime.js
index 0b0f7cc0844ec..7d2c669b2e250 100644
--- a/src/runtime.js
+++ b/src/runtime.js
@@ -64,7 +64,10 @@ var RuntimeGenerator = {
     if (typeof ENVIRONMENT_IS_PTHREAD !== 'undefined' && ENVIRONMENT_IS_PTHREAD) throw 'Runtime.dynamicAlloc is not available in pthreads!'; // This is because each worker has its own copy of DYNAMICTOP, of which main thread is authoritative.
 #endif
     var ret = RuntimeGenerator.alloc(size, 'DYNAMIC');
-    ret += '; if (DYNAMICTOP >= TOTAL_MEMORY) { var success = enlargeMemory(); if (!success) { DYNAMICTOP = ret; return 0; } }'
+    if (SAFE_HEAP) ret += '; if (asm) { Runtime.setDynamicTop(DYNAMICTOP); }';
+    ret += '; if (DYNAMICTOP >= TOTAL_MEMORY) { var success = enlargeMemory(); if (!success) { DYNAMICTOP = ret; ';
+    if (SAFE_HEAP) ret += 'if (asm) { Runtime.setDynamicTop(DYNAMICTOP); }';
+    ret += ' return 0; } }'
     return ret;
   },
 
diff --git a/src/settings.js b/src/settings.js
index d59637d273469..2ba61cc14b8f1 100644
--- a/src/settings.js
+++ b/src/settings.js
@@ -61,9 +61,6 @@ var GLOBAL_BASE = -1; // where global data begins; the start of static memory. -
                       // default, any other value will be used as an override
 
 // Code embetterments
-var USE_TYPED_ARRAYS = 2; // Use typed arrays for the heap. See https://github.com/kripken/emscripten/wiki/Code-Generation-Modes/
-                          // 2 is a single heap, accessible through views as int8, int32, etc. This is
-                          //   the only supported mode.
 var DOUBLE_MODE = 1; // How to load and store 64-bit doubles.
                      // A potential risk is that doubles may be only 32-bit aligned. Forcing 64-bit alignment
                      // in Clang itself should be able to solve that, or as a workaround in DOUBLE_MODE 1 we
@@ -86,7 +83,7 @@ var FORCE_ALIGNED_MEMORY = 0; // If enabled, assumes all reads and writes are fu
                               // smaller and faster code, or even the option to turn this flag on.
 var WARN_UNALIGNED = 0; // Warn at compile time about instructions that LLVM tells us are not fully aligned.
                         // This is useful to find places in your code where you might refactor to ensure proper
-                        // alignment. (this option is fastcomp-only)
+                        // alignment.
 var PRECISE_I64_MATH = 1; // If enabled, i64 addition etc. is emulated - which is slow but precise. If disabled,
                           // we use the 'double trick' which is fast but incurs rounding at high values.
                           // If set to 2, we always include the i64 math code, which is necessary in the case
@@ -116,8 +113,6 @@ var SIMD = 0; // Whether to allow autovectorized SIMD code ( https://github.com/
               // also want the autovectorizer to run.
               // Note that SIMD support in browsers is not yet there (as of Sep 2, 2014), so you will be
               // running in a polyfill, which is not fast.
-              // (In older versions of emscripten, in particular pre-fastcomp, SIMD=1 was needed to get
-              // any SIMD output at all.)
 
 var USE_CLOSURE_COMPILER = 0; // Whether closure compiling is being run on this output
 
@@ -159,7 +154,7 @@ var SIMPLIFY_IFS = 1; // Whether to simplify ifs in js-optimizer.js
 
 // Generated code debugging options
 var SAFE_HEAP = 0; // Check each write to the heap, for example, this will give a clear
-                   // error on what would be segfaults in a native build (like deferencing
+                   // error on what would be segfaults in a native build (like dereferencing
                    // 0). See preamble.js for the actual checks performed.
 var SAFE_HEAP_LOG = 0; // Log out all SAFE_HEAP operations
 
@@ -531,6 +526,8 @@ var SWAPPABLE_ASM_MODULE = 0; // If 1, then all exports from the asm.js module w
                               // as we depend on them being a drop-in replacement for each
                               // other (same globals on the heap at the same locations, etc.)
 
+var SEPARATE_ASM = 0; // see emcc --separate-asm
+
 var PGO = 0; // Enables profile-guided optimization in the form of runtime checks for
              // which functions are actually called. Emits a list during shutdown that you
              // can pass to DEAD_FUNCTIONS (you can also emit the list manually by
diff --git a/system/include/emscripten/threading.h b/system/include/emscripten/threading.h
index f7805c4b9ea2d..3c798c7f49c57 100644
--- a/system/include/emscripten/threading.h
+++ b/system/include/emscripten/threading.h
@@ -112,6 +112,10 @@ int emscripten_is_main_runtime_thread(void);
 // Returns 1 if the current thread is the main browser thread.
 int emscripten_is_main_browser_thread(void);
 
+// A temporary workaround to issue https://github.com/kripken/emscripten/issues/3495:
+// Call this in the body of all lock-free atomic (cas) loops that the main thread might enter
+// which don't otherwise call to any pthread api calls (mutexes) or C runtime functions
+// that are considered cancellation points.
 void emscripten_main_thread_process_queued_calls();
 
 // Direct syscall access, second argument is a varargs pointer. used in proxying
diff --git a/tests/core/test_inlinejs3.in b/tests/core/test_inlinejs3.in
index c3b9b7690a12e..da57f26ef5555 100644
--- a/tests/core/test_inlinejs3.in
+++ b/tests/core/test_inlinejs3.in
@@ -26,6 +26,9 @@ int main(int argc, char **argv) {
   sum = 0;
   sum = EM_ASM_INT_V({ return globalVar }); // no inputs, just output
   printf("sum: %d\n", sum);
+  printf("|%.2f|\n", EM_ASM_DOUBLE({
+    return $0; // return double properly
+  }, 1.2));
   for (int i = 0; i < argc*2; i++) loop_iter();
   return 0;
 }
diff --git a/tests/core/test_inlinejs3.out b/tests/core/test_inlinejs3.out
index c48cc3c89993b..e65cddda353a2 100644
--- a/tests/core/test_inlinejs3.out
+++ b/tests/core/test_inlinejs3.out
@@ -12,5 +12,6 @@ i: 0,0.00
 i: 1,0.08
 i: 2,0.17
 sum: 6
+|1.20|
 loop iter!
 loop iter!
diff --git a/tests/optimizer/asm-eliminator-test.js b/tests/optimizer/asm-eliminator-test.js
index 3fd98a3c0b843..a80f613594508 100644
--- a/tests/optimizer/asm-eliminator-test.js
+++ b/tests/optimizer/asm-eliminator-test.js
@@ -130,11 +130,11 @@ function label() {
  }
 }
 function switchy() {
- var no = 0, yes = 0;
+ var yes1 = 0, yes = 0;
  var a = 0, b = 0;
  while (1) switch (label | 0) {
   case 1:
-   no = 100; // eliminatable in theory, but eliminator does not look into switch. must leave def above as well.
+   yes1 = 100;
    break;
   case 2:
    yes = 111;
diff --git a/tests/parallel_test_core.py b/tests/parallel_test_core.py
index d358304accdd9..d509b05331056 100755
--- a/tests/parallel_test_core.py
+++ b/tests/parallel_test_core.py
@@ -23,15 +23,38 @@ class Watcher(threading.Thread):
 
   def run(self):
     last = -1
-    while not Watcher.stop:
-      total = 0
+    bytes_read = {}
+    bytes = {}
+    for mode in optimal_order:
+      bytes[mode] = ''
+      bytes_read[mode] = 0
+
+    running = True
+    while running:
+      time.sleep(2)
+      if Watcher.stop: running = False
       for mode in optimal_order:
-        if os.path.exists(mode + '.err'):
-          total += os.stat(mode + '.err').st_size
-      if total != last:
-        last = total
-        print '[parallel_test_copy.py watcher] total output: %d' % total
-      time.sleep(10)
+        logfile = mode + '.err'
+        if os.path.exists(logfile):
+          new_size = os.stat(logfile).st_size
+          if new_size > bytes_read[mode]:
+            with open(logfile, 'rb') as f:
+              f.seek(bytes_read[mode])
+              bytes[mode] += f.read(new_size - bytes_read[mode])
+            bytes_read[mode] = new_size
+
+            # Flush printed lines to stdout if we have enough worth of one full test.
+            most_recent_test_start_pos = bytes[mode].rfind('(test_core.')
+            if most_recent_test_start_pos != -1:
+              most_recent_line_end = bytes[mode].rfind('\ntest_', 0, most_recent_test_start_pos)
+              if most_recent_line_end != -1:
+                lines_ready_to_print = bytes[mode][0:most_recent_line_end+1].strip()
+                print lines_ready_to_print
+                bytes[mode] = bytes[mode][most_recent_line_end+1:]
+
+          # Flush all the remaining lines if we are quitting.
+          if not running:
+            print bytes[mode]
 
 # run tests for one mode
 def run_mode(args):
@@ -58,6 +81,10 @@ def main():
     if os.path.exists(mode + '.err'):
       os.unlink(mode + '.err')
 
+  # prebuild the cache before starting the parallel run of the whole suite to avoid a race condition where each thread would start building the cache separately
+  proc = subprocess.Popen([PYTHON, path_from_root('embuilder.py'), 'build', 'ALL'])
+  proc.communicate()
+
   watcher = Watcher()
   watcher.start()
 
@@ -70,12 +97,6 @@ def main():
   # quit watcher
   Watcher.stop = True
 
-  # emit all outputs
-  for mode in optimal_order:
-    print '=== %s ===' % mode
-    if os.path.exists(mode + '.err'):
-      print open(mode + '.err').read()
-    print ''
   return sum(num_failures)
 
 if __name__ == '__main__':
diff --git a/tests/runner.py b/tests/runner.py
index 20c9eaa5f8d53..cde5c1b945ff4 100755
--- a/tests/runner.py
+++ b/tests/runner.py
@@ -259,9 +259,15 @@ def build(self, src, dirname, filename, output_processor=None, main_file=None, a
         assert ('/* memory initializer */' not in src) or ('/* memory initializer */ allocate([]' in src)
 
   def validate_asmjs(self, err):
-    if "asm.js type error: 'Float32x4' is not a standard SIMD type" in err:
-      err = err.replace("asm.js type error: 'Float32x4' is not a standard SIMD type", "")
-      print >> sys.stderr, "\nWARNING: ignoring asm.js type error due to old SpiderMonkey\n"
+    if "asm.js type error: 'Int8x16' is not a standard SIMD type" in err:
+      err = err.replace("asm.js type error: 'Int8x16' is not a standard SIMD type", "")
+      print >> sys.stderr, "\nWARNING: ignoring asm.js type error from Int8x16 due to implementation not yet available in SpiderMonkey\n"
+    if "asm.js type error: 'Int16x8' is not a standard SIMD type" in err:
+      err = err.replace("asm.js type error: 'Int16x8' is not a standard SIMD type", "")
+      print >> sys.stderr, "\nWARNING: ignoring asm.js type error from Int16x8 due to implementation not yet available in SpiderMonkey\n"
+    if "asm.js type error: 'Float64x2' is not a standard SIMD type" in err:
+      err = err.replace("asm.js type error: 'Float64x2' is not a standard SIMD type", "")
+      print >> sys.stderr, "\nWARNING: ignoring asm.js type error from Float64x2 due to implementation not yet available in SpiderMonkey\n"
 
     if 'uccessfully compiled asm.js code' in err and 'asm.js link error' not in err:
       print >> sys.stderr, "[was asm.js'ified]"
diff --git a/tests/sockets/test_sockets_echo_client.c b/tests/sockets/test_sockets_echo_client.c
index af01311d5f21f..b10cf76febabb 100644
--- a/tests/sockets/test_sockets_echo_client.c
+++ b/tests/sockets/test_sockets_echo_client.c
@@ -188,6 +188,22 @@ int main() {
     finish(EXIT_FAILURE);
   }
 
+  {
+    int z;
+    struct sockaddr_in adr_inet;
+    socklen_t len_inet = sizeof adr_inet;
+    z = getsockname(server.fd, (struct sockaddr *)&adr_inet, &len_inet);
+    if (z != 0) {
+      perror("getsockname");
+      finish(EXIT_FAILURE);
+    }
+    char buffer[1000];
+    sprintf(buffer, "%s:%u\n", inet_ntoa(adr_inet.sin_addr), (unsigned)ntohs(adr_inet.sin_port));
+    char *correct = "127.0.0.1:49161\n";
+    printf("got (expected) socket: %s (%s), size %d (%d)\n", buffer, correct, strlen(buffer), strlen(correct));
+    assert(strncmp(buffer, correct, 10) == 0);
+  }
+
 #ifdef __EMSCRIPTEN__
 #if TEST_ASYNC
   // The first parameter being passed is actually an arbitrary userData pointer
diff --git a/tests/test_browser.py b/tests/test_browser.py
index e80d72b2f7fad..1ca279c332204 100644
--- a/tests/test_browser.py
+++ b/tests/test_browser.py
@@ -1034,7 +1034,7 @@ def test_fs_workerfs_package(self):
   def test_fs_lz4fs_package(self):
     # generate data
     import random
-    try_delete('subdir')
+    self.clear()
     os.mkdir('subdir')
     open('file1.txt', 'w').write('0123456789' * (1024*128))
     open(os.path.join('subdir', 'file2.txt'), 'w').write('1234567890' * (1024*128))
@@ -1045,6 +1045,8 @@ def test_fs_lz4fs_package(self):
     # compress in emcc,  -s LZ4=1  tells it to tell the file packager
     print 'emcc-normal'
     self.btest(os.path.join('fs', 'test_lz4fs.cpp'), '2', args=['-s', 'LZ4=1', '--preload-file', 'file1.txt', '--preload-file', 'subdir/file2.txt', '--preload-file', 'file3.txt'], timeout=60)
+    assert os.stat('file1.txt').st_size + os.stat(os.path.join('subdir', 'file2.txt')).st_size + os.stat('file3.txt').st_size == 3*1024*128*10 + 1
+    assert os.stat('test.data').st_size < (3*1024*128*10)/2 # over half is gone
     print '    emcc-opts'
     self.btest(os.path.join('fs', 'test_lz4fs.cpp'), '2', args=['-s', 'LZ4=1', '--preload-file', 'file1.txt', '--preload-file', 'subdir/file2.txt', '--preload-file', 'file3.txt', '-O2'], timeout=60)
 
@@ -2800,6 +2802,9 @@ def test_aaa_pthread_supported(self):
     for args in [[], ['-s', 'USE_PTHREADS=1', '-s', 'PTHREAD_POOL_SIZE=8']]:
       self.btest(path_from_root('tests', 'pthread', 'test_pthread_supported.cpp'), expected='0', args=['-O3'] + args, timeout=30)
 
+  def test_aaa_separate_asm_pthreads(self):
+    self.btest(path_from_root('tests', 'pthread', 'test_pthread_atomics.cpp'), expected='0', args=['-O3', '-s', 'USE_PTHREADS=1', '-s', 'PTHREAD_POOL_SIZE=8', '--separate-asm', '--profiling'], timeout=30)
+
   # Test that it is possible to send a signal via calling alarm(timeout), which in turn calls to the signal handler set by signal(SIGALRM, func);
   def test_sigalrm(self):
     self.btest(path_from_root('tests', 'sigalrm.cpp'), expected='0', args=['-O3'], timeout=30)
diff --git a/tests/test_core.py b/tests/test_core.py
index 96dcf7716419c..213a23865d241 100644
--- a/tests/test_core.py
+++ b/tests/test_core.py
@@ -5710,9 +5710,10 @@ def test():
       test()
 
   def test_sse1(self):
-    self.banned_js_engines = [NODE_JS] # the test code hits NaN canonicalization on node.js
     if self.is_emterpreter(): return self.skip('todo')
-    if 'SAFE_HEAP=1' in self.emcc_args: return self.skip('SSE with SAFE_HEAP=1 breaks due to NaN canonicalization!')
+    if 'SAFE_HEAP=1' in self.emcc_args and SPIDERMONKEY_ENGINE in JS_ENGINES:
+      self.banned_js_engines += [SPIDERMONKEY_ENGINE]
+      print 'Skipping test_sse1 with SAFE_HEAP=1 on SpiderMonkey, since it fails due to NaN canonicalization.'
     Settings.PRECISE_F32 = 1 # SIMD currently requires Math.fround
 
     orig_args = self.emcc_args
@@ -5722,7 +5723,6 @@ def test_sse1(self):
 
   # Tests the full SSE1 API.
   def test_sse1_full(self):
-    self.banned_js_engines = [NODE_JS] # the test code hits NaN canonicalization on node.js
     if self.is_emterpreter(): return self.skip('todo')
     Popen([CLANG, path_from_root('tests', 'test_sse1_full.cpp'), '-o', 'test_sse1_full', '-D_CRT_SECURE_NO_WARNINGS=1'] + get_clang_native_args(), stdout=PIPE).communicate()
     native_result, err = Popen('./test_sse1_full', stdout=PIPE).communicate()
@@ -5736,19 +5736,17 @@ def test_sse1_full(self):
 
   # Tests the full SSE2 API.
   def test_sse2_full(self):
-    return self.skip('todo: No Float64x2 type available anymore.')
     if self.is_emterpreter(): return self.skip('todo')
-    if SPIDERMONKEY_ENGINE not in JS_ENGINES: return self.skip('test_sse2_full requires SpiderMonkey to run.')
-    if '-O1' in self.emcc_args or '-O2' in self.emcc_args or '-O3' in self.emcc_args or '-Oz' in self.emcc_args:
-      return self.skip('TODO: SIMD does not currently validate as asm.js in SpiderMonkey, run only in unoptimized mode.')
-    Popen([CLANG, path_from_root('tests', 'test_sse2_full.cpp'), '-o', 'test_sse2_full', '-D_CRT_SECURE_NO_WARNINGS=1'] + get_clang_native_args(), stdout=PIPE).communicate()
+    args = []
+    if '-O0' in self.emcc_args: args += ['-D_DEBUG=1']
+    Popen([CLANG, path_from_root('tests', 'test_sse2_full.cpp'), '-o', 'test_sse2_full', '-D_CRT_SECURE_NO_WARNINGS=1'] + args + get_clang_native_args(), stdout=PIPE).communicate()
     native_result, err = Popen('./test_sse2_full', stdout=PIPE).communicate()
     native_result = native_result.replace('\r\n', '\n') # Windows line endings fix
 
     Settings.PRECISE_F32 = 1 # SIMD currently requires Math.fround
     orig_args = self.emcc_args
     for mode in [[], ['-s', 'SIMD=1']]:
-      self.emcc_args = orig_args + mode + ['-I' + path_from_root('tests'), '-msse2']
+      self.emcc_args = orig_args + mode + ['-I' + path_from_root('tests'), '-msse2'] + args
       self.do_run(open(path_from_root('tests', 'test_sse2_full.cpp'), 'r').read(), native_result)
 
   def test_simd(self):
@@ -5770,10 +5768,7 @@ def test_simd2(self):
   def test_simd3(self):
     if self.is_emterpreter(): return self.skip('todo')
 
-    self.banned_js_engines = [NODE_JS] # fails in simd.js polyfill
-
-    if '-O1' in self.emcc_args or '-O2' in self.emcc_args or '-O3' in self.emcc_args or '-Oz' in self.emcc_args:
-      return self.skip('TODO: Fails under optimizations')
+    Settings.PRECISE_F32 = 1 # SIMD currently requires Math.fround
 
     test_path = path_from_root('tests', 'core', 'test_simd3')
     src, output = (test_path + s for s in ('.in', '.out'))
@@ -5842,11 +5837,6 @@ def test_simd10(self):
     # test_simd10 is to test that loading and storing arbitrary bit patterns works in SSE1.
     if self.is_emterpreter(): return self.skip('todo')
 
-    self.banned_js_engines = [NODE_JS] # the test code hits NaN canonicalization on node.js
-
-    if '-O1' in self.emcc_args or '-O2' in self.emcc_args or '-O3' in self.emcc_args or '-Oz' in self.emcc_args:
-      return self.skip('TODO: Compiler is too aggressive in optimizing and generates code that breaks due to NaN canonicalization! https://github.com/kripken/emscripten/issues/3403')
-
     test_path = path_from_root('tests', 'core', 'test_simd10')
     src, output = (test_path + s for s in ('.in', '.out'))
 
@@ -5857,11 +5847,6 @@ def test_simd11(self):
     # test_simd11 is to test that _mm_movemask_ps works correctly when handling input floats with 0xFFFFFFFF NaN bit patterns.
     if self.is_emterpreter(): return self.skip('todo')
 
-    self.banned_js_engines = [NODE_JS] # the test code hits NaN canonicalization on node.js
-
-    if '-O1' in self.emcc_args or '-O2' in self.emcc_args or '-O3' in self.emcc_args or '-Oz' in self.emcc_args:
-      return self.skip('TODO: Compiler is too aggressive in optimizing and generates code that breaks due to NaN canonicalization! https://github.com/kripken/emscripten/issues/3403')
-
     test_path = path_from_root('tests', 'core', 'test_simd11')
     src, output = (test_path + s for s in ('.in', '.out'))
 
@@ -7439,27 +7424,6 @@ def test_minmax(self):
   def test_locale(self):
     self.do_run_from_file(path_from_root('tests', 'test_locale.c'), path_from_root('tests', 'test_locale.out'))
 
-  def test_sixtyfour_bit_return_value(self):
-    # This test checks that the most significant 32 bits of a 64 bit long are correctly made available
-    # to native JavaScript applications that wish to interact with compiled code returning 64 bit longs.
-    # The MS 32 bits should be available in Runtime.getTempRet0() even when compiled with -O2 --closure 1
-
-    # Compile test.c and wrap it in a native JavaScript binding so we can call our compiled function from JS.
-    Popen([PYTHON, EMCC, path_from_root('tests', 'return64bit', 'test.c'), '--pre-js', path_from_root('tests', 'return64bit', 'testbindstart.js'), '--pre-js', path_from_root('tests', 'return64bit', 'testbind.js'), '--post-js', path_from_root('tests', 'return64bit', 'testbindend.js'), '-s', 'EXPORTED_FUNCTIONS=["_test"]', '-o', 'test.js', '-O2', '--closure', '1'], stdout=PIPE, stderr=PIPE).communicate()
-
-    # Simple test program to load the test.js binding library and call the binding to the
-    # C function returning the 64 bit long.
-    open(os.path.join(self.get_dir(), 'testrun.js'), 'w').write('''
-      var test = require("./test.js");
-      test.runtest();
-    ''')
-
-    # Run the test and confirm the output is as expected.
-    if NODE_JS in JS_ENGINES:
-      out = run_js('testrun.js', engine=NODE_JS, full_output=True)
-      assert "low = 5678" in out
-      assert "high = 1234" in out
-
   def test_async(self):
     self.banned_js_engines = [SPIDERMONKEY_ENGINE, V8_ENGINE] # needs setTimeout which only node has
 
diff --git a/tests/test_other.py b/tests/test_other.py
index 737fce8e7153a..260a08745b191 100644
--- a/tests/test_other.py
+++ b/tests/test_other.py
@@ -426,6 +426,7 @@ def check_makefile(configuration, dirname):
               if invoke_method == 'cmake':
                 # Test invoking cmake directly.
                 cmd = ['cmake', '-DCMAKE_TOOLCHAIN_FILE='+path_from_root('cmake', 'Modules', 'Platform', 'Emscripten.cmake'),
+                                '-DCMAKE_CROSSCOMPILING_EMULATOR="' + ' '.join(NODE_JS) + '"',
                                 '-DCMAKE_BUILD_TYPE=' + configuration, cmake_arguments[i], '-G', generator, cmakelistsdir]
                 env = tools.shared.Building.remove_sh_exe_from_path(os.environ)
               else:
@@ -433,6 +434,7 @@ def check_makefile(configuration, dirname):
                 cmd = [emconfigure, 'cmake', '-DCMAKE_BUILD_TYPE=' + configuration, cmake_arguments[i], '-G', generator, cmakelistsdir]
                 env = os.environ.copy()
 
+              print str(cmd)
               ret = Popen(cmd, stdout=None if EM_BUILD_VERBOSE_LEVEL >= 2 else PIPE, stderr=None if EM_BUILD_VERBOSE_LEVEL >= 1 else PIPE, env=env).communicate()
               if len(ret) > 1 and ret[1] != None and len(ret[1].strip()) > 0:
                 logging.error(ret[1]) # If there were any errors, print them directly to console for diagnostics.
@@ -689,7 +691,6 @@ def measure_funcs(filename):
           Popen([PYTHON, EMCC, src] + libs + ['-o', 'test.js', '-O2'] + debug + ['-s', 'OUTLINING_LIMIT=%d' % outlining_limit] + args).communicate()
           assert os.path.exists('test.js')
           shutil.copyfile('test.js', '%d_test.js' % outlining_limit)
-          assert len(JS_ENGINES) > 1
           for engine in JS_ENGINES:
             if engine == V8_ENGINE: continue # ban v8, weird failures
             out = run_js('test.js', engine=engine, stderr=PIPE, full_output=True)
@@ -1976,6 +1977,8 @@ def test_embind(self):
         assert "FAIL" not in output, output
 
   def test_llvm_nativizer(self):
+    if WINDOWS: return self.skip('test_llvm_nativizer does not work on Windows: https://github.com/kripken/emscripten/issues/702')
+    if OSX: return self.skip('test_llvm_nativizer does not work on OS X: https://github.com/kripken/emscripten/issues/709')
     try:
       Popen(['as', '--version'], stdout=PIPE, stderr=PIPE).communicate()
     except:
@@ -1986,8 +1989,10 @@ def test_llvm_nativizer(self):
     open(os.path.join(self.get_dir(), 'somefile.binary'), 'w').write('''waka waka############################''')
     open(os.path.join(self.get_dir(), 'test.file'), 'w').write('''ay file..............,,,,,,,,,,,,,,''')
     open(os.path.join(self.get_dir(), 'stdin'), 'w').write('''inter-active''')
-    Popen([PYTHON, EMCC, os.path.join(self.get_dir(), 'files.cpp'), '-c']).communicate()
-    Popen([PYTHON, path_from_root('tools', 'nativize_llvm.py'), os.path.join(self.get_dir(), 'files.o')], stdout=PIPE, stderr=PIPE).communicate(input)
+    subprocess.check_call([PYTHON, EMCC, os.path.join(self.get_dir(), 'files.cpp'), '-c'])
+    nativize_llvm = Popen([PYTHON, path_from_root('tools', 'nativize_llvm.py'), os.path.join(self.get_dir(), 'files.o')], stdout=PIPE, stderr=PIPE)
+    nativize_llvm.communicate(input)
+    assert nativize_llvm.returncode == 0
     output = Popen([os.path.join(self.get_dir(), 'files.o.run')], stdin=open(os.path.join(self.get_dir(), 'stdin')), stdout=PIPE, stderr=PIPE).communicate()
     self.assertContained('''size: 37
 data: 119,97,107,97,32,119,97,107,97,35,35,35,35,35,35,35,35,35,35,35,35,35,35,35,35,35,35,35,35,35,35,35,35,35,35,35,35
@@ -2729,7 +2734,7 @@ def test_incorrect_static_call(self):
 
   def test_llvm_lit(self):
     llvm_src = get_fastcomp_src_dir()
-    cmd = [os.path.join(LLVM_ROOT, 'llvm-lit'), '-v', os.path.join(llvm_src, 'test', 'CodeGen', 'JS')]
+    cmd = [PYTHON, os.path.join(LLVM_ROOT, 'llvm-lit.py'), '-v', os.path.join(llvm_src, 'test', 'CodeGen', 'JS')]
     print cmd
     p = Popen(cmd)
     p.communicate()
@@ -2915,23 +2920,26 @@ def test_bad_function_pointer_cast(self):
       for safe in [0, 1]:
         for emulate_casts in [0, 1]:
           for emulate_fps in [0, 1]:
-            cmd = [PYTHON, EMCC, 'src.cpp', '-O' + str(opts), '-s', 'SAFE_HEAP=' + str(safe)]
-            if emulate_casts:
-              cmd += ['-s', 'EMULATE_FUNCTION_POINTER_CASTS=1']
-            if emulate_fps:
-              cmd += ['-s', 'EMULATED_FUNCTION_POINTERS=1']
-            print cmd
-            Popen(cmd).communicate()
-            output = run_js('a.out.js', stderr=PIPE, full_output=True, assert_returncode=None)
-            if emulate_casts:
-              assert 'Hello, world.' in output, output
-            elif safe:
-              assert 'Function table mask error' in output, output
-            else:
-              if opts == 0:
-                assert 'Invalid function pointer called' in output, output
+            for relocate in [0, 1]:
+              cmd = [PYTHON, EMCC, 'src.cpp', '-O' + str(opts), '-s', 'SAFE_HEAP=' + str(safe)]
+              if emulate_casts:
+                cmd += ['-s', 'EMULATE_FUNCTION_POINTER_CASTS=1']
+              if emulate_fps:
+                cmd += ['-s', 'EMULATED_FUNCTION_POINTERS=1']
+              if relocate:
+                cmd += ['-s', 'RELOCATABLE=1'] # disables asm-optimized safe heap
+              print cmd
+              Popen(cmd).communicate()
+              output = run_js('a.out.js', stderr=PIPE, full_output=True, assert_returncode=None)
+              if emulate_casts:
+                assert 'Hello, world.' in output, output
+              elif safe:
+                assert 'Function table mask error' in output, output
               else:
-                assert 'abort()' in output, output
+                if opts == 0:
+                  assert 'Invalid function pointer called' in output, output
+                else:
+                  assert 'abort()' in output, output
 
   def test_aliased_func_pointers(self):
     open('src.cpp', 'w').write(r'''
@@ -5166,3 +5174,49 @@ def test_no_warnings(self):
     out, err = Popen([PYTHON, EMCC, path_from_root('tests', 'hello_libcxx.cpp')], stderr=PIPE).communicate()
     assert err == '', err
 
+  def test_emterpreter_file_suggestion(self):
+    for linkable in [0, 1]:
+      for to_file in [0, 1]:
+        self.clear()
+        cmd = [PYTHON, EMCC, '-s', 'EMTERPRETIFY=1', path_from_root('tests', 'hello_libcxx.cpp'), '-s', 'LINKABLE=' + str(linkable), '-O1', '-s', 'USE_ZLIB=1']
+        if to_file:
+          cmd += ['-s', 'EMTERPRETIFY_FILE="code.dat"']
+        print cmd
+        stdout, stderr = Popen(cmd, stderr=PIPE).communicate()
+        need_warning = linkable and not to_file
+        assert ('''warning: emterpreter bytecode is fairly large''' in stderr) == need_warning, stderr
+        assert ('''It is recommended to use  -s EMTERPRETIFY_FILE=..''' in stderr) == need_warning, stderr
+
+  def test_llvm_lto(self):
+    sizes = {}
+    for lto in [0, 1, 2, 3]:
+      cmd = [PYTHON, EMCC, path_from_root('tests', 'hello_libcxx.cpp'), '-O2', '--llvm-lto', str(lto)]
+      print cmd
+      check_execute(cmd)
+      self.assertContained('hello, world!', run_js('a.out.js'))
+      sizes[lto] = os.stat('a.out.js').st_size
+    print sizes
+    assert sizes[1] < sizes[0] # lto reduces size
+    assert sizes[2] > sizes[0] # fake lto is aggressive at increasing code size
+    assert sizes[3] not in set([sizes[0], sizes[1], sizes[2]]) # mode 3 is different (deterministic builds means this tests an actual change)
+
+  def test_sixtyfour_bit_return_value(self):
+    # This test checks that the most significant 32 bits of a 64 bit long are correctly made available
+    # to native JavaScript applications that wish to interact with compiled code returning 64 bit longs.
+    # The MS 32 bits should be available in Runtime.getTempRet0() even when compiled with -O2 --closure 1
+
+    # Compile test.c and wrap it in a native JavaScript binding so we can call our compiled function from JS.
+    check_execute([PYTHON, EMCC, path_from_root('tests', 'return64bit', 'test.c'), '--pre-js', path_from_root('tests', 'return64bit', 'testbindstart.js'), '--pre-js', path_from_root('tests', 'return64bit', 'testbind.js'), '--post-js', path_from_root('tests', 'return64bit', 'testbindend.js'), '-s', 'EXPORTED_FUNCTIONS=["_test"]', '-o', 'test.js', '-O2', '--closure', '1'])
+
+    # Simple test program to load the test.js binding library and call the binding to the
+    # C function returning the 64 bit long.
+    open(os.path.join(self.get_dir(), 'testrun.js'), 'w').write('''
+      var test = require("./test.js");
+      test.runtest();
+    ''')
+
+    # Run the test and confirm the output is as expected.
+    out = run_js('testrun.js', full_output=True)
+    assert "low = 5678" in out
+    assert "high = 1234" in out
+
diff --git a/tests/test_sockets.py b/tests/test_sockets.py
index f84263f30752c..f5c4e7fc28133 100644
--- a/tests/test_sockets.py
+++ b/tests/test_sockets.py
@@ -253,6 +253,34 @@ def test_inet4(self):
 ok.
 ''')
 
+  def test_getsockname_null(self):
+    self.do_run(r'''
+      #include <sys/socket.h>
+      #include <stdio.h>
+      #include <assert.h>
+      #include <sys/socket.h>
+      #include <netinet/in.h>
+      #include <arpa/inet.h> 
+      #include <string.h>
+      int main() {
+        int fd;
+        int z;
+        fd = socket(PF_INET, SOCK_STREAM, IPPROTO_TCP);
+        struct sockaddr_in adr_inet;
+        socklen_t len_inet = sizeof adr_inet;
+        z = getsockname(fd, (struct sockaddr *)&adr_inet, &len_inet);
+        if (z != 0) {
+          perror("getsockname error");
+          return 1;
+        }
+        char buffer[1000];
+        sprintf(buffer, "%s:%u\n", inet_ntoa(adr_inet.sin_addr), (unsigned)ntohs(adr_inet.sin_port));
+        char *correct = "0.0.0.0:0\n";
+        printf("got (expected) socket: %s (%s), size %d (%d)\n", buffer, correct, strlen(buffer), strlen(correct));
+        puts("success.");
+      }
+    ''', 'success.')
+
   def test_getaddrinfo(self):
     self.emcc_args=[]
     self.do_run(open(path_from_root('tests', 'sockets', 'test_getaddrinfo.c')).read(), 'success')
diff --git a/tests/test_sse1_full.cpp b/tests/test_sse1_full.cpp
index a45350b25d7a2..607081f8256ec 100644
--- a/tests/test_sse1_full.cpp
+++ b/tests/test_sse1_full.cpp
@@ -25,7 +25,7 @@ int main()
 	Ret_M128_M128(__m128, _mm_sub_ss);
 
 	// SSE1 Elementary Math functions:
-#if 0 // TODO: Precision differs in SIMD.js and native. Test differently
+#if 0 // TODO: Precision differs in SIMD.js and native. Test differently. See https://github.com/kripken/emscripten/issues/3049
 	Ret_M128(__m128, _mm_rcp_ps);
 	Ret_M128(__m128, _mm_rcp_ss);
 	Ret_M128(__m128, _mm_rsqrt_ps);
diff --git a/tests/test_sse2_full.cpp b/tests/test_sse2_full.cpp
index f424803b0ff49..afa146a39989a 100644
--- a/tests/test_sse2_full.cpp
+++ b/tests/test_sse2_full.cpp
@@ -5,53 +5,83 @@
 #define ENABLE_SSE2
 #include "test_sse_full.h"
 
-int main()
-{
-	float *interesting_floats = get_interesting_floats();
-	int numInterestingFloats = sizeof(interesting_floats_)/sizeof(interesting_floats_[0]);
-	assert(numInterestingFloats % 4 == 0);
+// We don't have an Int64x2 type, but we do emulate in scalar Int64x2 code. However, 
+// the PNaCl ExpandI64.cpp path fails to expand our emulated code when LLVM wants to pack or
+// unpack the 64bit elements to/from vectors. Therefore skip that path for now in release builds.
+// (debug builds work ok, since they avoid optimized smartness)
+// See https://github.com/kripken/emscripten/issues/3788
+#ifndef _DEBUG
+#define NO_INT64X2
+#endif
 
-	uint32_t *interesting_ints = get_interesting_ints();
-	int numInterestingInts = sizeof(interesting_ints_)/sizeof(interesting_ints_[0]);
-	assert(numInterestingInts % 4 == 0);
+#ifndef _DEBUG
+// The following tests break when optimizer is applied, so disable them for now. Baby steps.
+// See https://github.com/kripken/emscripten/issues/3789
+#define BREAKS_UNDER_OPTIMIZATION
+#endif
 
-	double *interesting_doubles = get_interesting_doubles();
-	int numInterestingDoubles = sizeof(interesting_doubles_)/sizeof(interesting_doubles_[0]);
-	assert(numInterestingDoubles % 4 == 0);
+float *interesting_floats = get_interesting_floats();
+int numInterestingFloats = sizeof(interesting_floats_)/sizeof(interesting_floats_[0]);
+uint32_t *interesting_ints = get_interesting_ints();
+int numInterestingInts = sizeof(interesting_ints_)/sizeof(interesting_ints_[0]);
+double *interesting_doubles = get_interesting_doubles();
+int numInterestingDoubles = sizeof(interesting_doubles_)/sizeof(interesting_doubles_[0]);
 
+void test_arithmetic()
+{
 	// SSE2 Arithmetic instructions:
 	M128i_M128i_M128i(_mm_add_epi16);
 	M128i_M128i_M128i(_mm_add_epi32);
+#ifndef NO_INT64X2
 	M128i_M128i_M128i(_mm_add_epi64);
+#endif
 	M128i_M128i_M128i(_mm_add_epi8);
 	Ret_M128d_M128d(__m128d, _mm_add_pd);
 	Ret_M128d_M128d(__m128d, _mm_add_sd);
+#ifndef BREAKS_UNDER_OPTIMIZATION
 	M128i_M128i_M128i(_mm_adds_epi16);
+#endif
 	M128i_M128i_M128i(_mm_adds_epi8);
+#ifndef BREAKS_UNDER_OPTIMIZATION
 	M128i_M128i_M128i(_mm_adds_epu16);
+#endif
 	M128i_M128i_M128i(_mm_adds_epu8);
 	Ret_M128d_M128d(__m128d, _mm_div_pd);
 	Ret_M128d_M128d(__m128d, _mm_div_sd);
+#ifndef BREAKS_UNDER_OPTIMIZATION
 	M128i_M128i_M128i(_mm_madd_epi16);
+#endif
+#ifndef NO_INT64X2
 	M128i_M128i_M128i(_mm_mul_epu32);
-
+#endif
 	Ret_M128d_M128d(__m128d, _mm_mul_pd);
 	Ret_M128d_M128d(__m128d, _mm_mul_sd);
+#ifndef BREAKS_UNDER_OPTIMIZATION
 	M128i_M128i_M128i(_mm_mulhi_epi16);
 	M128i_M128i_M128i(_mm_mulhi_epu16);
+#endif
 	M128i_M128i_M128i(_mm_mullo_epi16);
 	M128i_M128i_M128i(_mm_sad_epu8);
 	M128i_M128i_M128i(_mm_sub_epi16);
 	M128i_M128i_M128i(_mm_sub_epi32);
+#ifndef NO_INT64X2
 	M128i_M128i_M128i(_mm_sub_epi64);
+#endif
 	M128i_M128i_M128i(_mm_sub_epi8);
 	Ret_M128d_M128d(__m128d, _mm_sub_pd);
 	Ret_M128d_M128d(__m128d, _mm_sub_sd);
+#ifndef BREAKS_UNDER_OPTIMIZATION
 	M128i_M128i_M128i(_mm_subs_epi16);
+#endif
 	M128i_M128i_M128i(_mm_subs_epi8);
+#ifndef BREAKS_UNDER_OPTIMIZATION
 	M128i_M128i_M128i(_mm_subs_epu16);
+#endif
 	M128i_M128i_M128i(_mm_subs_epu8);
+}
 
+void test_cast()
+{
 	// SSE2 Cast functions:
 	Ret_M128d(__m128, _mm_castpd_ps);
 	Ret_M128d(__m128i, _mm_castpd_si128);
@@ -59,7 +89,10 @@ int main()
 	Ret_M128(__m128i, _mm_castps_si128);
 	Ret_M128i(__m128d, _mm_castsi128_pd);
 	Ret_M128i(__m128, _mm_castsi128_ps);
+}
 
+void test_compare()
+{
 	// SSE2 Compare instructions:
 	M128i_M128i_M128i(_mm_cmpeq_epi16);
 	M128i_M128i_M128i(_mm_cmpeq_epi32);
@@ -106,52 +139,80 @@ int main()
 	Ret_M128d_M128d(int, _mm_ucomile_sd);
 	Ret_M128d_M128d(int, _mm_ucomilt_sd);
 	Ret_M128d_M128d(int, _mm_ucomineq_sd);
+}
 
+void test_convert()
+{
 	// SSE2 Convert instructions:
 	Ret_M128i(__m128d, _mm_cvtepi32_pd);
+#ifndef BREAKS_UNDER_OPTIMIZATION
 	Ret_M128i(__m128, _mm_cvtepi32_ps);
+#endif
 	Ret_M128d(__m128i, _mm_cvtpd_epi32);
 	Ret_M128d(__m128, _mm_cvtpd_ps);
 	Ret_M128(__m128i, _mm_cvtps_epi32);
 	Ret_M128(__m128d,  _mm_cvtps_pd);
 	Ret_M128(double, _mm_cvtsd_f64);
 	Ret_M128d(int, _mm_cvtsd_si32);
+#ifndef NO_INT64X2
 	Ret_M128d(int64_t, _mm_cvtsd_si64); // _mm_cvtsd_si64x is an alias to this.
+#endif
 	Ret_M128i(int, _mm_cvtsi128_si32);
+#ifndef NO_INT64X2
 	Ret_M128i(int64_t, _mm_cvtsi128_si64); // _mm_cvtsi128_si64x is an alias to this.
+#endif
 	Ret_M128d_int(__m128d, _mm_cvtsi32_sd);
 	Ret_int(__m128i, _mm_cvtsi32_si128);
+#ifndef NO_INT64X2
 	Ret_M128d_int64(__m128d, _mm_cvtsi64_sd); // _mm_cvtsi64x_sd is an alias to this.
 	Ret_int64(__m128i, _mm_cvtsi64_si128); // _mm_cvtsi64x_si128 is an alias to this.
+#endif
 	Ret_M128d_M128d(__m128d, _mm_cvtss_sd);
 	Ret_M128d(__m128i, _mm_cvttpd_epi32);
 	Ret_M128(__m128i, _mm_cvttps_epi32);
 	Ret_M128d(int, _mm_cvttsd_si32);
+#ifndef NO_INT64X2
 	Ret_M128d(int64_t, _mm_cvttsd_si64); // _mm_cvttsd_si64x is an alias to this.
-
+#endif
+}
+void test_elementarymath()
+{
 	// SSE2 Elementary Math Functions instructions:
 	Ret_M128d(__m128d, _mm_sqrt_pd);
 	Ret_M128d_M128d(__m128d, _mm_sqrt_sd);
+}
 
+void test_generalsupport()
+{
 	// SSE2 General Support instructions:
 	_mm_clflush(interesting_floats);
 	_mm_lfence();
 	_mm_mfence();
 	_mm_pause();
+}
 
+void test_load()
+{
 	// SSE2 Load functions:
 	Ret_DoublePtr(__m128d, _mm_load_pd, 2, 2);
 	Ret_DoublePtr(__m128d, _mm_load_pd1, 1, 1);
 	Ret_DoublePtr(__m128d, _mm_load_sd, 1, 1);
+#ifndef BREAKS_UNDER_OPTIMIZATION
 	Ret_IntPtr(__m128i, _mm_load_si128, __m128i*, 4, 4);
+#endif
 	Ret_DoublePtr(__m128d, _mm_load1_pd, 1, 1);
 	Ret_M128d_DoublePtr(__m128d, _mm_loadh_pd, double*, 1, 1);
 	Ret_IntPtr(__m128i, _mm_loadl_epi64, __m128i*, 2, 1);
 	Ret_M128d_DoublePtr(__m128d, _mm_loadl_pd, double*, 1, 1);
 	Ret_DoublePtr(__m128d, _mm_loadr_pd, 2, 2);
 	Ret_DoublePtr(__m128d, _mm_loadu_pd, 2, 1);
+#ifndef BREAKS_UNDER_OPTIMIZATION
 	Ret_IntPtr(__m128i, _mm_loadu_si128, __m128i*, 4, 1);
+#endif
+}
 
+void test_logical()
+{
 	// SSE2 Logical instructions:
 	Ret_M128d_M128d(__m128d, _mm_and_pd);
 	M128i_M128i_M128i(_mm_and_si128);
@@ -161,21 +222,42 @@ int main()
 	M128i_M128i_M128i(_mm_or_si128);
 	Ret_M128d_M128d(__m128d, _mm_xor_pd);
 	M128i_M128i_M128i(_mm_xor_si128);
+}
 
+void test_misc()
+{
 	// SSE2 Miscellaneous instructions:
+#ifndef BREAKS_UNDER_OPTIMIZATION
 	Ret_M128i(int, _mm_movemask_epi8);
+#endif
 	Ret_M128d(int, _mm_movemask_pd);
 	M128i_M128i_M128i(_mm_packs_epi16);
+#ifndef BREAKS_UNDER_OPTIMIZATION
 	M128i_M128i_M128i(_mm_packs_epi32);
+#endif
 	M128i_M128i_M128i(_mm_packus_epi16);
+}
 
+void test_move()
+{
 	// SSE2 Move instructions:
+#ifndef NO_INT64X2
 	Ret_M128i(__m128i, _mm_move_epi64);
+#endif
 	Ret_M128d_M128d(__m128d, _mm_move_sd);
+}
 
+void test_probability()
+{
 	// SSE2 Probability/Statistics instructions:
+#ifndef BREAKS_UNDER_OPTIMIZATION
 	M128i_M128i_M128i(_mm_avg_epu16);
+#endif
 	M128i_M128i_M128i(_mm_avg_epu8);
+}
+
+void test_set()
+{
 /*
 	// SSE2 Set functions:
 	_mm_set_epi16
@@ -200,14 +282,21 @@ int main()
 	_mm_setzero_pd
 	_mm_setzero_si128
 */
+}
 
+void test_shift()
+{
 	// SSE2 Shift instructions:
 	M128i_M128i_M128i(_mm_sll_epi16);
 	M128i_M128i_M128i(_mm_sll_epi32);
+#ifndef NO_INT64X2
 	M128i_M128i_M128i(_mm_sll_epi64);
+#endif
 	Ret_M128i_Tint(__m128i, _mm_slli_epi16);
 	Ret_M128i_Tint(__m128i, _mm_slli_epi32);
+#ifndef NO_INT64X2
 	Ret_M128i_Tint(__m128i, _mm_slli_epi64);
+#endif
 	Ret_M128i_Tint(__m128i, _mm_slli_si128); // _mm_bslli_si128 is an alias to this.
 	M128i_M128i_M128i(_mm_sra_epi16);
 	M128i_M128i_M128i(_mm_sra_epi32);
@@ -215,22 +304,38 @@ int main()
 	Ret_M128i_Tint(__m128i, _mm_srai_epi32);
 	M128i_M128i_M128i(_mm_srl_epi16);
 	M128i_M128i_M128i(_mm_srl_epi32);
+#ifndef NO_INT64X2
 	M128i_M128i_M128i(_mm_srl_epi64);
+#endif
+#ifndef BREAKS_UNDER_OPTIMIZATION
 	Ret_M128i_Tint(__m128i, _mm_srli_epi16);
+#endif
 	Ret_M128i_Tint(__m128i, _mm_srli_epi32);
+#ifndef NO_INT64X2
 	Ret_M128i_Tint(__m128i, _mm_srli_epi64);
+#endif
 	Ret_M128i_Tint(__m128i, _mm_srli_si128); // _mm_bsrli_si128 is an alias to this.
+}
 
+void test_specialmath()
+{
 	// SSE2 Special Math instructions:
+#ifndef BREAKS_UNDER_OPTIMIZATION
 	M128i_M128i_M128i(_mm_max_epi16);
 	M128i_M128i_M128i(_mm_max_epu8);
+#endif
 	Ret_M128d_M128d(__m128d, _mm_max_pd);
 	Ret_M128d_M128d(__m128d, _mm_max_sd);
+#ifndef BREAKS_UNDER_OPTIMIZATION
 	M128i_M128i_M128i(_mm_min_epi16);
 	M128i_M128i_M128i(_mm_min_epu8);
+#endif
 	Ret_M128d_M128d(__m128d, _mm_min_pd);
 	Ret_M128d_M128d(__m128d, _mm_min_sd);
+}
 
+void test_store()
+{
 	// SSE2 Store instructions:
 	void_M128i_M128i_OutIntPtr(_mm_maskmoveu_si128, char*, 16, 1);
 	void_OutDoublePtr_M128d(_mm_store_pd, double*, 16, 16);
@@ -238,7 +343,9 @@ int main()
 	void_OutIntPtr_M128(_mm_store_si128, __m128i*, 16, 16);
 	void_OutDoublePtr_M128d(_mm_store1_pd, double*, 16, 16); // _mm_store_pd1 is an alias to this.
 	void_OutDoublePtr_M128d(_mm_storeh_pd, double*, 8, 1);
+#ifndef NO_INT64X2
 	void_OutIntPtr_M128(_mm_storel_epi64, __m128i*, 8, 1);
+#endif
 	void_OutDoublePtr_M128d(_mm_storel_pd, double*, 8, 1);
 	void_OutDoublePtr_M128d(_mm_storer_pd, double*, 16, 16);
 	void_OutDoublePtr_M128d(_mm_storeu_pd, double*, 16, 1);
@@ -246,8 +353,13 @@ int main()
 	void_OutDoublePtr_M128d(_mm_stream_pd, double*, 16, 16);
 	void_OutIntPtr_M128(_mm_stream_si128, __m128i*, 16, 16);
 	void_OutIntPtr_int(_mm_stream_si32, int*, 4, 1);
+#ifndef NO_INT64X2
 	void_OutIntPtr_int64(_mm_stream_si64, int64_t*, 8, 1);
+#endif
+}
 
+void test_swizzle()
+{
 	// SSE2 Swizzle instructions:
 	Ret_M128i_Tint(int, _mm_extract_epi16);
 	Ret_M128i_int_Tint(__m128i, _mm_insert_epi16);
@@ -257,12 +369,40 @@ int main()
 	Ret_M128i_Tint(__m128i, _mm_shufflelo_epi16);
 	M128i_M128i_M128i(_mm_unpackhi_epi16);
 	M128i_M128i_M128i(_mm_unpackhi_epi32);
+#ifndef NO_INT64X2
 	M128i_M128i_M128i(_mm_unpackhi_epi64);
+#endif
 	M128i_M128i_M128i(_mm_unpackhi_epi8);
 	Ret_M128d_M128d(__m128d, _mm_unpackhi_pd);
 	M128i_M128i_M128i(_mm_unpacklo_epi16);
 	M128i_M128i_M128i(_mm_unpacklo_epi32);
+#ifndef NO_INT64X2
 	M128i_M128i_M128i(_mm_unpacklo_epi64);
+#endif
 	M128i_M128i_M128i(_mm_unpacklo_epi8);
 	Ret_M128d_M128d(__m128d, _mm_unpacklo_pd);
 }
+
+int main()
+{
+	assert(numInterestingFloats % 4 == 0);
+	assert(numInterestingInts % 4 == 0);
+	assert(numInterestingDoubles % 4 == 0);	
+
+	test_arithmetic();
+	test_cast();
+	test_compare();
+	test_convert();
+	test_elementarymath();
+	test_generalsupport();
+	test_load();
+	test_logical();
+	test_misc();
+	test_move();
+	test_probability();
+	test_set();
+	test_shift();
+	test_specialmath();
+	test_store();
+	test_swizzle();	
+}
diff --git a/tools/emterpretify.py b/tools/emterpretify.py
index 0184ecfe720e9..1c6a0f31135ec 100755
--- a/tools/emterpretify.py
+++ b/tools/emterpretify.py
@@ -1044,6 +1044,9 @@ def post_process_code(code):
 ''' % (len(all_code), all_code[0], all_code[1], all_code[2], all_code[3], len(relocations), relocations[0])]
 
   else:
+    if len(all_code) > 1024*1024:
+      shared.logging.warning('warning: emterpreter bytecode is fairly large, %.2f MB. It is recommended to use  -s EMTERPRETIFY_FILE=..  so that it is saved as a binary file, instead of the default behavior which is to embed it as text (as text, it can cause very slow compile and startup times)' % (len(all_code) / (1024*1024.)))
+
     CHUNK_SIZE = 10240
 
     i = 0
diff --git a/tools/file_packager.py b/tools/file_packager.py
index 871b193a282cb..89516ccec024d 100644
--- a/tools/file_packager.py
+++ b/tools/file_packager.py
@@ -138,7 +138,7 @@
     try:
       from shared import CRUNCH
     except Exception, e:
-      print >> sys.stderr, 'could not import CRUNCH (make sure it is defined properly in ~/.emscripten)'
+      print >> sys.stderr, 'could not import CRUNCH (make sure it is defined properly in ' + shared.hint_config_file_location() + ')'
       raise e
     crunch = arg.split('=')[1] if '=' in arg else '128'
     leading = ''
diff --git a/tools/js-optimizer.js b/tools/js-optimizer.js
index 00399c27da0a2..ba03816b600b2 100644
--- a/tools/js-optimizer.js
+++ b/tools/js-optimizer.js
@@ -4,8 +4,11 @@
 //==============================================================================
 // Optimizer tool. This is meant to be run after the emscripten compiler has
 // finished generating code. These optimizations are done on the generated
-// code to further improve it. Some of the modifications also work in
-// conjunction with closure compiler.
+// code to further improve it.
+//
+// Be aware that this is *not* a general JS optimizer. It assumes that the
+// input is valid asm.js and makes strong assumptions based on this. It may do
+// anything from crashing to optimizing incorrectly if the input is not valid!
 //
 // TODO: Optimize traverse to modify a node we want to replace, in-place,
 //       instead of returning it to the previous call frame where we check?
@@ -2393,7 +2396,7 @@ function registerize(ast) {
     // we just use a fresh register to make sure we avoid this, but it could be
     // optimized to check for safe registers (free, and not used in this loop level).
     var varRegs = {}; // maps variables to the register they will use all their life
-    var freeRegsClasses = asm ? [[], [], [], [], [], []] : []; // two classes for asm, one otherwise XXX - hardcoded length
+    var freeRegsClasses = asm ? [[], [], [], [], [], [], [], [], []] : []; // two classes for asm, one otherwise XXX - hardcoded length
     var nextReg = 1;
     var fullNames = {};
     var loopRegs = {}; // for each loop nesting level, the list of bound variables
@@ -2556,8 +2559,8 @@ function registerizeHarder(ast) {
     // Utilities for allocating register variables.
     // We need distinct register pools for each type of variable.
 
-    var allRegsByType = [{}, {}, {}, {}, {}, {}]; // XXX - hardcoded length
-    var regPrefixByType = ['i', 'd', 'f', 'F4', 'I4', 'n'];
+    var allRegsByType = [{}, {}, {}, {}, {}, {}, {}, {}, {}]; // XXX - hardcoded length
+    var regPrefixByType = ['i', 'd', 'f', 'F4', 'F2', 'I16', 'I8', 'I4', 'n'];
     var nextReg = 1;
 
     function createReg(forName) {
@@ -4066,7 +4069,9 @@ function eliminate(ast, memSafe) {
             for (var j = 0; j < stats.length; j++) {
               traverseInOrder(stats[j]);
             }
-            // We cannot track from one switch case into another, undo all new trackings TODO: general framework here, use in if-else as well
+            // We cannot track from one switch case into another if there are external dependencies, undo all new trackings
+            // Otherwise we can track, e.g. a var used in a case before assignment in another case is UB in asm.js, so no need for the assignment
+            // TODO: general framework here, use in if-else as well
             for (var t in tracked) {
               if (!(t in originalTracked)) {
                 var info = tracked[t];
@@ -5642,73 +5647,77 @@ function safeHeap(ast) {
     ptr = ['binary', '|', ptr, ['num', 0]];
     return ptr;
   }
-  traverseGenerated(ast, function(node, type) {
-    if (type === 'assign') {
-      if (node[1] === true && node[2][0] === 'sub') {
-        var heap = node[2][1][1];
-        var ptr = fixPtr(node[2][2], heap);
-        var value = node[3];
-        // SAFE_HEAP_STORE(ptr, value, bytes, isFloat) 
-        switch (heap) {
-          case 'HEAP8':   case 'HEAPU8': {
-            return ['call', ['name', 'SAFE_HEAP_STORE'], [ptr, makeAsmCoercion(value, ASM_INT), ['num', 1], ['num', 0]]];
-          }
-          case 'HEAP16':  case 'HEAPU16': {
-            return ['call', ['name', 'SAFE_HEAP_STORE'], [ptr, makeAsmCoercion(value, ASM_INT), ['num', 2], ['num', 0]]];
-          }
-          case 'HEAP32':  case 'HEAPU32': {
-            return ['call', ['name', 'SAFE_HEAP_STORE'], [ptr, makeAsmCoercion(value, ASM_INT), ['num', 4], ['num', 0]]];
-          }
-          case 'HEAPF32': {
-            return ['call', ['name', 'SAFE_HEAP_STORE'], [ptr, makeAsmCoercion(value, ASM_DOUBLE), ['num', 4], ['num', 1]]];
-          }
-          case 'HEAPF64': {
-            return ['call', ['name', 'SAFE_HEAP_STORE'], [ptr, makeAsmCoercion(value, ASM_DOUBLE), ['num', 8], ['num', 1]]];
-          }
-          default: throw 'bad heap ' + heap;
-        }
-      }
-    } else if (type === 'sub') {
-      var target = node[1][1];
-      if (target[0] === 'H') {
-        // heap access
-        var heap = target;
-        var ptr = fixPtr(node[2], heap);
-        // SAFE_HEAP_LOAD(ptr, bytes, isFloat) 
-        switch (heap) {
-          case 'HEAP8': {
-            return makeAsmCoercion(['call', ['name', 'SAFE_HEAP_LOAD'], [ptr, ['num', 1], ['num', 0], ['num', 0]]], ASM_INT);
-          }
-          case 'HEAPU8': {
-            return makeAsmCoercion(['call', ['name', 'SAFE_HEAP_LOAD'], [ptr, ['num', 1], ['num', 0], ['num', 1]]], ASM_INT);
-          }
-          case 'HEAP16': {
-            return makeAsmCoercion(['call', ['name', 'SAFE_HEAP_LOAD'], [ptr, ['num', 2], ['num', 0], ['num', 0]]], ASM_INT);
-          }
-          case 'HEAPU16': {
-            return makeAsmCoercion(['call', ['name', 'SAFE_HEAP_LOAD'], [ptr, ['num', 2], ['num', 0], ['num', 1]]], ASM_INT);
-          }
-          case 'HEAP32': {
-            return makeAsmCoercion(['call', ['name', 'SAFE_HEAP_LOAD'], [ptr, ['num', 4], ['num', 0], ['num', 0]]], ASM_INT);
-          }
-          case 'HEAPU32': {
-            return makeAsmCoercion(['call', ['name', 'SAFE_HEAP_LOAD'], [ptr, ['num', 4], ['num', 0], ['num', 1]]], ASM_INT);
-          }
-          case 'HEAPF32': {
-            return makeAsmCoercion(['call', ['name', 'SAFE_HEAP_LOAD'], [ptr, ['num', 4], ['num', 1], ['num', 0]]], ASM_DOUBLE);
-          }
-          case 'HEAPF64': {
-            return makeAsmCoercion(['call', ['name', 'SAFE_HEAP_LOAD'], [ptr, ['num', 8], ['num', 1], ['num', 0]]], ASM_DOUBLE);
+  var SAFE_HEAP_FUNCS = set('SAFE_HEAP_LOAD', 'SAFE_HEAP_LOAD_D', 'SAFE_HEAP_STORE', 'SAFE_HEAP_STORE_D', 'SAFE_FT_MASK');
+  traverseGeneratedFunctions(ast, function(func) {
+    if (func[1] in SAFE_HEAP_FUNCS) return null;
+    traverseGenerated(func, function(node, type) {
+      if (type === 'assign') {
+        if (node[1] === true && node[2][0] === 'sub') {
+          var heap = node[2][1][1];
+          var ptr = fixPtr(node[2][2], heap);
+          var value = node[3];
+          // SAFE_HEAP_STORE(ptr, value, bytes, isFloat) 
+          switch (heap) {
+            case 'HEAP8':   case 'HEAPU8': {
+              return ['call', ['name', 'SAFE_HEAP_STORE'], [ptr, makeAsmCoercion(value, ASM_INT), ['num', 1]]];
+            }
+            case 'HEAP16':  case 'HEAPU16': {
+              return ['call', ['name', 'SAFE_HEAP_STORE'], [ptr, makeAsmCoercion(value, ASM_INT), ['num', 2]]];
+            }
+            case 'HEAP32':  case 'HEAPU32': {
+              return ['call', ['name', 'SAFE_HEAP_STORE'], [ptr, makeAsmCoercion(value, ASM_INT), ['num', 4]]];
+            }
+            case 'HEAPF32': {
+              return ['call', ['name', 'SAFE_HEAP_STORE_D'], [ptr, makeAsmCoercion(value, ASM_DOUBLE), ['num', 4]]];
+            }
+            case 'HEAPF64': {
+              return ['call', ['name', 'SAFE_HEAP_STORE_D'], [ptr, makeAsmCoercion(value, ASM_DOUBLE), ['num', 8]]];
+            }
+            default: throw 'bad heap ' + heap;
+          }
+        }
+      } else if (type === 'sub') {
+        var target = node[1][1];
+        if (target[0] === 'H') {
+          // heap access
+          var heap = target;
+          var ptr = fixPtr(node[2], heap);
+          // SAFE_HEAP_LOAD(ptr, bytes, isFloat) 
+          switch (heap) {
+            case 'HEAP8': {
+              return makeAsmCoercion(['call', ['name', 'SAFE_HEAP_LOAD'], [ptr, ['num', 1], ['num', 0]]], ASM_INT);
+            }
+            case 'HEAPU8': {
+              return makeAsmCoercion(['call', ['name', 'SAFE_HEAP_LOAD'], [ptr, ['num', 1], ['num', 1]]], ASM_INT);
+            }
+            case 'HEAP16': {
+              return makeAsmCoercion(['call', ['name', 'SAFE_HEAP_LOAD'], [ptr, ['num', 2], ['num', 0]]], ASM_INT);
+            }
+            case 'HEAPU16': {
+              return makeAsmCoercion(['call', ['name', 'SAFE_HEAP_LOAD'], [ptr, ['num', 2], ['num', 1]]], ASM_INT);
+            }
+            case 'HEAP32': {
+              return makeAsmCoercion(['call', ['name', 'SAFE_HEAP_LOAD'], [ptr, ['num', 4], ['num', 0]]], ASM_INT);
+            }
+            case 'HEAPU32': {
+              return makeAsmCoercion(['call', ['name', 'SAFE_HEAP_LOAD'], [ptr, ['num', 4], ['num', 1]]], ASM_INT);
+            }
+            case 'HEAPF32': {
+              return makeAsmCoercion(['call', ['name', 'SAFE_HEAP_LOAD_D'], [ptr, ['num', 4]]], ASM_DOUBLE);
+            }
+            case 'HEAPF64': {
+              return makeAsmCoercion(['call', ['name', 'SAFE_HEAP_LOAD_D'], [ptr, ['num', 8]]], ASM_DOUBLE);
+            }
+            default: throw 'bad heap ' + heap;
           }
-          default: throw 'bad heap ' + heap;
+        } else {
+          assert(target[0] == 'F');
+          // function table indexing mask
+          assert(node[2][0] === 'binary' && node[2][1] === '&');
+          node[2][2] = makeAsmCoercion(['call', ['name', 'SAFE_FT_MASK'], [makeAsmCoercion(node[2][2], ASM_INT), makeAsmCoercion(node[2][3], ASM_INT)]], ASM_INT);
         }
-      } else {
-        assert(target[0] == 'F');
-        // function table indexing mask
-        assert(node[2][0] === 'binary' && node[2][1] === '&');
-        node[2][2] = makeAsmCoercion(['call', ['name', 'SAFE_FT_MASK'], [makeAsmCoercion(node[2][2], ASM_INT), makeAsmCoercion(node[2][3], ASM_INT)]], ASM_INT);
       }
-    }
+    });
   });
 }
 
diff --git a/tools/lz4-compress.js b/tools/lz4-compress.js
index cc4fa466b0521..b64cdf4f22118 100644
--- a/tools/lz4-compress.js
+++ b/tools/lz4-compress.js
@@ -143,8 +143,10 @@ if (!(data instanceof ArrayBuffer)) {
   data = new Uint8Array(data).buffer;
 }
 
+var start = Date.now();
 var compressedData = MiniLZ4.compressPackage(data);
 nodeFS['writeFileSync'](output, Buffer(compressedData.data));
 compressedData.data = null;
+printErr('compressed in ' + (Date.now() - start) + ' ms');
 print(JSON.stringify(compressedData));
 
diff --git a/tools/nativize_llvm.py b/tools/nativize_llvm.py
index 52dfdea1aebd5..8333515761ae4 100755
--- a/tools/nativize_llvm.py
+++ b/tools/nativize_llvm.py
@@ -31,6 +31,11 @@ def path_from_root(*pathelems):
     Popen(['as', filename + '.s', '-o', filename + '.o']).communicate()[0]
     if os.path.exists(filename + '.o'): break
   if os.path.exists(filename + '.o'): break
+
+if not os.path.exists(filename + '.o'):
+  print >> sys.stderr, 'tools/nativize_llvm.py: Failed to convert "' + filename + '" to "' + filename + '.o"!'
+  sys.exit(1)
+
 print 'o => runnable'
 Popen(['g++', path_from_root('system', 'lib', 'debugging.cpp'), filename + '.o', '-o', filename + '.run'] + ['-l' + lib for lib in libs]).communicate()[0]
 
diff --git a/tools/optimizer/CMakeLists.txt b/tools/optimizer/CMakeLists.txt
index 3f62438f9f5cc..d393cdeb8d011 100644
--- a/tools/optimizer/CMakeLists.txt
+++ b/tools/optimizer/CMakeLists.txt
@@ -11,9 +11,11 @@ else()
 	set(IS_GCC_LIKE FALSE)
 endif()
 
-# -DPROFILING will print crude timing information to stderr for initial
-# identification of areas to profile in more depth with
+# -DCMAKE_CXX_FLAGS=-DPROFILING will print crude timing information to stderr
+# for initial identification of areas to profile in more depth with
 # CALLGRIND_{START,STOP}_INSTRUMENTATION or similar
+# Don't forget to also pass -DCMAKE_BUILD_TYPE=Release to cmake or your build
+# won't be optimized by the compiler!
 
 if (IS_GCC_LIKE)
 	set(cFlags "-std=c++11 -fno-exceptions -fno-rtti")
diff --git a/tools/optimizer/optimizer-main.cpp b/tools/optimizer/optimizer-main.cpp
index c2fd67e46a1ba..02b7fe17f02fd 100644
--- a/tools/optimizer/optimizer-main.cpp
+++ b/tools/optimizer/optimizer-main.cpp
@@ -1,3 +1,13 @@
+//==============================================================================
+// Optimizer tool. This is meant to be run after the emscripten compiler has
+// finished generating code. These optimizations are done on the generated
+// code to further improve it.
+//
+// Be aware that this is *not* a general JS optimizer. It assumes that the
+// input is valid asm.js and makes strong assumptions based on this. It may do
+// anything from crashing to optimizing incorrectly if the input is not valid!
+//==============================================================================
+
 #include "simple_ast.h"
 #include "optimizer.h"
 
@@ -15,6 +25,12 @@ int main(int argc, char **argv) {
     else if (str == "last") last = true;
   }
 
+#ifdef PROFILING
+    std::string str("reading and parsing");
+    clock_t start = clock();
+    errv("starting %s", str.c_str());
+#endif
+
   // Read input file
   FILE *f = fopen(argv[1], "r");
   assert(f);
@@ -49,6 +65,10 @@ int main(int argc, char **argv) {
   }
   // do not free input, its contents are used as strings
 
+#ifdef PROFILING
+    errv("    %s took %lu milliseconds", str.c_str(), (clock() - start)/1000);
+#endif
+
   // Run passes on the Document
   for (int i = 2; i < argc; i++) {
     std::string str(argv[i]);
@@ -78,7 +98,7 @@ int main(int argc, char **argv) {
       abort();
     }
 #ifdef PROFILING
-    errv("    %s took %lu microseconds", str.c_str(), clock() - start);
+    errv("    %s took %lu milliseconds", str.c_str(), (clock() - start)/1000);
 #endif
 #ifdef DEBUGGING
     if (worked) {
diff --git a/tools/optimizer/optimizer.cpp b/tools/optimizer/optimizer.cpp
index 043478a8cf8a5..378542e76be73 100644
--- a/tools/optimizer/optimizer.cpp
+++ b/tools/optimizer/optimizer.cpp
@@ -105,7 +105,8 @@ AsmType detectType(Ref node, AsmData *asmData=nullptr, bool inVarDef=false);
 Ref makeEmpty();
 bool isEmpty(Ref node);
 Ref makeAsmVarDef(const IString& v, AsmType type);
-Ref makeArray();
+Ref makeArray(int size_hint);
+Ref makeBool(bool b);
 Ref makeNum(double x);
 Ref makeName(IString str);
 Ref makeAsmCoercion(Ref node, AsmType type);
@@ -167,7 +168,8 @@ struct AsmData {
       stats[i] = makeEmpty();
       i++;
     }
-    // process initial variable definitions
+    // process initial variable definitions and remove '= 0' etc parts - these
+    // are not actually assignments in asm.js
     while (i < stats->size()) {
       Ref node = stats[i];
       if (node[0] != VAR) break;
@@ -193,7 +195,7 @@ struct AsmData {
         Ref type = node[0];
         if (type == VAR) {
           dump("bad, seeing a var in need of fixing", func);
-          assert(0); //, 'should be no vars to fix! ' + func[1] + ' : ' + JSON.stringify(node));
+          abort(); //, 'should be no vars to fix! ' + func[1] + ' : ' + JSON.stringify(node));
         }
       });
       i++;
@@ -218,7 +220,7 @@ struct AsmData {
       }
     }
     // calculate variable definitions
-    Ref varDefs = makeArray();
+    Ref varDefs = makeArray(vars.size());
     for (auto v : vars) {
       varDefs->push_back(makeAsmVarDef(v, locals[v].type));
     }
@@ -240,7 +242,7 @@ struct AsmData {
     for (auto param : func[2]->getArray()) {
       IString str = param->getIString();
       assert(locals.count(str) > 0);
-      stats[next++] = make1(STAT, make3(ASSIGN, &(arena.alloc())->setBool(true), makeName(str.c_str()), makeAsmCoercion(makeName(str.c_str()), locals[str].type)));
+      stats[next++] = make1(STAT, make3(ASSIGN, makeBool(true), makeName(str.c_str()), makeAsmCoercion(makeName(str.c_str()), locals[str].type)));
     }
     if (varDefs->size()) {
       stats[next] = make1(VAR, varDefs);
@@ -402,8 +404,12 @@ AsmType detectType(Ref node, AsmData *asmData, bool inVarDef) {
 
 // Constructions TODO: share common constructions, and assert they remain frozen
 
-Ref makeArray() {
-  return &arena.alloc()->setArray();
+Ref makeArray(int size_hint=0) {
+  return &arena.alloc()->setArray(size_hint);
+}
+
+Ref makeBool(bool b) {
+  return &arena.alloc()->setBool(b);
 }
 
 Ref makeString(const IString& s) {
@@ -411,65 +417,46 @@ Ref makeString(const IString& s) {
 }
 
 Ref makeEmpty() {
-  Ref ret(makeArray());
-  ret->push_back(makeString(TOPLEVEL));
-  ret->push_back(makeArray());
-  return ret;
-}
-
-Ref makePair(Ref x, Ref y) {
-  Ref ret = makeArray();
-  ret->push_back(x);
-  ret->push_back(y);
-  return ret;
+  return ValueBuilder::makeToplevel();
 }
 
 Ref makeNum(double x) {
-  Ref ret(makeArray());
-  ret->push_back(makeString(NUM));
-  ret->push_back(&arena.alloc()->setNumber(x));
-  return ret;
+  return ValueBuilder::makeDouble(x);
 }
 
 Ref makeName(IString str) {
-  Ref ret(makeArray());
-  ret->push_back(makeString(NAME));
-  ret->push_back(makeString(str));
-  return ret;
+  return ValueBuilder::makeName(str);
 }
 
 Ref makeBlock() {
-  Ref ret(makeArray());
-  ret->push_back(makeString(BLOCK));
-  ret->push_back(makeArray());
-  return ret;
+  return ValueBuilder::makeBlock();
 }
 
-Ref make1(IString type, Ref a) {
-  Ref ret(makeArray());
-  ret->push_back(makeString(type));
+Ref make1(IString s1, Ref a) {
+  Ref ret(makeArray(2));
+  ret->push_back(makeString(s1));
   ret->push_back(a);
   return ret;
 }
 
-Ref make2(IString type, IString a, Ref b) {
-  Ref ret(makeArray());
-  ret->push_back(makeString(type));
-  ret->push_back(makeString(a));
-  ret->push_back(b);
+Ref make2(IString s1, IString s2, Ref a) {
+  Ref ret(makeArray(2));
+  ret->push_back(makeString(s1));
+  ret->push_back(makeString(s2));
+  ret->push_back(a);
   return ret;
 }
 
-Ref make2(IString type, Ref a, Ref b) {
-  Ref ret(makeArray());
-  ret->push_back(makeString(type));
+Ref make2(IString s1, Ref a, Ref b) {
+  Ref ret(makeArray(3));
+  ret->push_back(makeString(s1));
   ret->push_back(a);
   ret->push_back(b);
   return ret;
 }
 
 Ref make3(IString type, IString a, Ref b, Ref c) {
-  Ref ret(makeArray());
+  Ref ret(makeArray(4));
   ret->push_back(makeString(type));
   ret->push_back(makeString(a));
   ret->push_back(b);
@@ -478,7 +465,7 @@ Ref make3(IString type, IString a, Ref b, Ref c) {
 }
 
 Ref make3(IString type, Ref a, Ref b, Ref c) {
-  Ref ret(makeArray());
+  Ref ret(makeArray(4));
   ret->push_back(makeString(type));
   ret->push_back(a);
   ret->push_back(b);
@@ -495,45 +482,45 @@ Ref makeAsmVarDef(const IString& v, AsmType type) {
       if (!ASM_FLOAT_ZERO.isNull()) {
         val = makeName(ASM_FLOAT_ZERO);
       } else {
-        val = make2(CALL, makeName(MATH_FROUND), &(makeArray())->push_back(makeNum(0)));
+        val = make2(CALL, makeName(MATH_FROUND), &(makeArray(1))->push_back(makeNum(0)));
       }
       break;
     }
     case ASM_FLOAT32X4: {
-      val = make2(CALL, makeName(SIMD_FLOAT32X4), &(makeArray())->push_back(makeNum(0)).push_back(makeNum(0)).push_back(makeNum(0)).push_back(makeNum(0)));
+      val = make2(CALL, makeName(SIMD_FLOAT32X4), &(makeArray(4))->push_back(makeNum(0)).push_back(makeNum(0)).push_back(makeNum(0)).push_back(makeNum(0)));
       break;
     }
     case ASM_FLOAT64X2: {
-      val = make2(CALL, makeName(SIMD_FLOAT64X2), &(makeArray())->push_back(makeNum(0)).push_back(makeNum(0)));
+      val = make2(CALL, makeName(SIMD_FLOAT64X2), &(makeArray(2))->push_back(makeNum(0)).push_back(makeNum(0)));
       break;
     }
     case ASM_INT8X16: {
-      val = make2(CALL, makeName(SIMD_INT8X16), &(makeArray())->push_back(makeNum(0)).push_back(makeNum(0)).push_back(makeNum(0)).push_back(makeNum(0)).push_back(makeNum(0)).push_back(makeNum(0)).push_back(makeNum(0)).push_back(makeNum(0)).push_back(makeNum(0)).push_back(makeNum(0)).push_back(makeNum(0)).push_back(makeNum(0)).push_back(makeNum(0)).push_back(makeNum(0)).push_back(makeNum(0)).push_back(makeNum(0)));
+      val = make2(CALL, makeName(SIMD_INT8X16), &(makeArray(16))->push_back(makeNum(0)).push_back(makeNum(0)).push_back(makeNum(0)).push_back(makeNum(0)).push_back(makeNum(0)).push_back(makeNum(0)).push_back(makeNum(0)).push_back(makeNum(0)).push_back(makeNum(0)).push_back(makeNum(0)).push_back(makeNum(0)).push_back(makeNum(0)).push_back(makeNum(0)).push_back(makeNum(0)).push_back(makeNum(0)).push_back(makeNum(0)));
       break;
     }
     case ASM_INT16X8: {
-      val = make2(CALL, makeName(SIMD_INT16X8), &(makeArray())->push_back(makeNum(0)).push_back(makeNum(0)).push_back(makeNum(0)).push_back(makeNum(0)).push_back(makeNum(0)).push_back(makeNum(0)).push_back(makeNum(0)).push_back(makeNum(0)));
+      val = make2(CALL, makeName(SIMD_INT16X8), &(makeArray(8))->push_back(makeNum(0)).push_back(makeNum(0)).push_back(makeNum(0)).push_back(makeNum(0)).push_back(makeNum(0)).push_back(makeNum(0)).push_back(makeNum(0)).push_back(makeNum(0)));
       break;
     }
     case ASM_INT32X4: {
-      val = make2(CALL, makeName(SIMD_INT32X4), &(makeArray())->push_back(makeNum(0)).push_back(makeNum(0)).push_back(makeNum(0)).push_back(makeNum(0)));
+      val = make2(CALL, makeName(SIMD_INT32X4), &(makeArray(4))->push_back(makeNum(0)).push_back(makeNum(0)).push_back(makeNum(0)).push_back(makeNum(0)));
       break;
     }
     default: assert(0);
   }
-  return makePair(&(arena.alloc()->setString(v)), val);
+  return make1(v, val);
 }
 
 Ref makeAsmCoercion(Ref node, AsmType type) {
   switch (type) {
     case ASM_INT: return make3(BINARY, OR, node, makeNum(0));
     case ASM_DOUBLE: return make2(UNARY_PREFIX, PLUS, node);
-    case ASM_FLOAT: return make2(CALL, makeName(MATH_FROUND), &(makeArray())->push_back(node));
-    case ASM_FLOAT32X4: return make2(CALL, makeName(SIMD_FLOAT32X4_CHECK), &(makeArray())->push_back(node));
-    case ASM_FLOAT64X2: return make2(CALL, makeName(SIMD_FLOAT64X2_CHECK), &(makeArray())->push_back(node));
-    case ASM_INT8X16: return make2(CALL, makeName(SIMD_INT8X16_CHECK), &(makeArray())->push_back(node));
-    case ASM_INT16X8: return make2(CALL, makeName(SIMD_INT16X8_CHECK), &(makeArray())->push_back(node));
-    case ASM_INT32X4: return make2(CALL, makeName(SIMD_INT32X4_CHECK), &(makeArray())->push_back(node));
+    case ASM_FLOAT: return make2(CALL, makeName(MATH_FROUND), &(makeArray(1))->push_back(node));
+    case ASM_FLOAT32X4: return make2(CALL, makeName(SIMD_FLOAT32X4_CHECK), &(makeArray(1))->push_back(node));
+    case ASM_FLOAT64X2: return make2(CALL, makeName(SIMD_FLOAT64X2_CHECK), &(makeArray(1))->push_back(node));
+    case ASM_INT8X16: return make2(CALL, makeName(SIMD_INT8X16_CHECK), &(makeArray(1))->push_back(node));
+    case ASM_INT16X8: return make2(CALL, makeName(SIMD_INT16X8_CHECK), &(makeArray(1))->push_back(node));
+    case ASM_INT32X4: return make2(CALL, makeName(SIMD_INT32X4_CHECK), &(makeArray(1))->push_back(node));
     case ASM_NONE:
     default: return node; // non-validating code, emit nothing XXX this is dangerous, we should only allow this when we know we are not validating
   }
@@ -730,22 +717,22 @@ void removeAllUselessSubNodes(Ref ast) {
 }
 
 Ref unVarify(Ref vars) { // transform var x=1, y=2 etc. into (x=1, y=2), i.e., the same assigns, but without a var definition
-  Ref ret = makeArray();
+  Ref ret = makeArray(1);
   ret->push_back(makeString(STAT));
   if (vars->size() == 1) {
-    ret->push_back(make3(ASSIGN, &(arena.alloc())->setBool(true), makeName(vars[0][0]->getIString()), vars[0][1]));
+    ret->push_back(make3(ASSIGN, makeBool(true), makeName(vars[0][0]->getIString()), vars[0][1]));
   } else {
-    ret->push_back(makeArray());
+    ret->push_back(makeArray(vars->size()-1));
     Ref curr = ret[1];
     for (size_t i = 0; i+1 < vars->size(); i++) {
       curr->push_back(makeString(SEQ));
-      curr->push_back(make3(ASSIGN, &(arena.alloc())->setBool(true), makeName(vars[i][0]->getIString()), vars[i][1]));
+      curr->push_back(make3(ASSIGN, makeBool(true), makeName(vars[i][0]->getIString()), vars[i][1]));
       if (i != vars->size()-2) {
         curr->push_back(makeArray());
         curr = curr[2];
       }
     }
-    curr->push_back(make3(ASSIGN, &(arena.alloc())->setBool(true), makeName(vars->back()[0]->getIString()), vars->back()[1]));
+    curr->push_back(make3(ASSIGN, makeBool(true), makeName(vars->back()[0]->getIString()), vars->back()[1]));
   }
   return ret;
 }
@@ -907,7 +894,7 @@ Ref simplifyCondition(Ref node) {
 // In memSafe mode, we are more careful and assume functions can replace HEAP and FUNCTION_TABLE, which
 // can happen in ALLOW_MEMORY_GROWTH mode
 
-StringSet ELIMINATION_SAFE_NODES("var assign call if toplevel do return label switch binary unary-prefix"); // do is checked carefully, however
+StringSet ELIMINATION_SAFE_NODES("assign call if toplevel do return label switch binary unary-prefix"); // do is checked carefully, however
 StringSet IGNORABLE_ELIMINATOR_SCAN_NODES("num toplevel string break continue dot"); // dot can only be STRING_TABLE.*
 StringSet ABORTING_ELIMINATOR_SCAN_NODES("new object function defun for while array throw"); // we could handle some of these, TODO, but nontrivial (e.g. for while, the condition is hit multiple times after the body)
 
@@ -939,10 +926,30 @@ class StringTypeMap : public std::unordered_map<IString, AsmType> {
 };
 
 void eliminate(Ref ast, bool memSafe) {
+#ifdef PROFILING
+  clock_t tasmdata = 0;
+  clock_t tfnexamine = 0;
+  clock_t tvarcheck = 0;
+  clock_t tstmtelim = 0;
+  clock_t tstmtscan = 0;
+  clock_t tcleanvars = 0;
+  clock_t treconstruct = 0;
+#endif
+
   // Find variables that have a single use, and if they can be eliminated, do so
-  traverseFunctions(ast, [&memSafe](Ref func) {
+  traverseFunctions(ast, [&](Ref func) {
+
+#ifdef PROFILING
+    clock_t start = clock();
+#endif
+
     AsmData asmData(func);
 
+#ifdef PROFILING
+    tasmdata += clock() - start;
+    start = clock();
+#endif
+
     // First, find the potentially eliminatable functions: that have one definition and one use
 
     StringIntMap definitions;
@@ -956,38 +963,28 @@ void eliminate(Ref ast, bool memSafe) {
     // examine body and note locals
     traversePre(func, [&](Ref node) {
       Ref type = node[0];
-      if (type == VAR) {
-        Ref node1 = node[1];
-        for (size_t i = 0; i < node1->size(); i++) {
-          Ref node1i = node1[i];
-          IString name = node1i[0]->getIString();
-          Ref value;
-          if (node1i->size() > 1 && !!(value = node1i[1])) {
-            definitions[name]++;
-            if (!values.has(name)) values[name] = value;
-          }
-          uses[name];
-        }
-      } else if (type == NAME) {
+      if (type == NAME) {
         IString& name = node[1]->getIString();
-        uses[name]++;// = uses[name] + 1;
+        uses[name]++;
+        namings[name]++;
       } else if (type == ASSIGN) {
         Ref target = node[2];
         if (target[0] == NAME) {
           IString& name = target[1]->getIString();
-          definitions[name]++;//= definitions[name] + 1;
-          uses[name]; // zero if not there already
-          if (!values.has(name)) values[name] = node[3];
+          // values is only used if definitions is 1
+          if (definitions[name]++ == 0) {
+            values[name] = node[3];
+          }
           assert(node[1]->isBool(true)); // not +=, -= etc., just =
           uses[name]--; // because the name node will show up by itself in the previous case
-          namings[name]++;// = namings[name] + 1; // offset it here, this tracks the total times we are named
         }
       }
     });
 
-    for (auto used : uses) {
-      namings[used.first] += used.second;
-    }
+#ifdef PROFILING
+    tfnexamine += clock() - start;
+    start = clock();
+#endif
 
     StringSet potentials; // local variables with 1 definition and 1 use
     StringSet sideEffectFree; // whether a local variable has no side effects in its definition. Only relevant when there are no uses
@@ -1050,6 +1047,11 @@ void eliminate(Ref ast, bool memSafe) {
       processVariable(name.first);
     }
 
+#ifdef PROFILING
+    tvarcheck += clock() - start;
+    start = clock();
+#endif
+
     //printErr('defs: ' + JSON.stringify(definitions));
     //printErr('uses: ' + JSON.stringify(uses));
     //printErr('values: ' + JSON.stringify(values));
@@ -1060,9 +1062,8 @@ void eliminate(Ref ast, bool memSafe) {
     //printErr('potentials: ' + JSON.stringify(potentials));
     // We can now proceed through the function. In each list of statements, we try to eliminate
     struct Tracking {
-      bool usesGlobals, usesMemory;
+      bool usesGlobals, usesMemory, hasDeps;
       Ref defNode;
-      StringSet deps;
       bool doesCall;
     };
     class Tracked : public std::unordered_map<IString, Tracking> {
@@ -1071,6 +1072,8 @@ void eliminate(Ref ast, bool memSafe) {
     };
     Tracked tracked;
     #define dumpTracked() { errv("tracking %d", tracked.size()); for (auto t : tracked) errv("... %s", t.first.c_str()); }
+    // Although a set would be more appropriate, it would also be slower
+    std::unordered_map<IString, StringVec> depMap;
 
     bool globalsInvalidated = false; // do not repeat invalidations, until we track something new
     bool memoryInvalidated = false;
@@ -1079,6 +1082,7 @@ void eliminate(Ref ast, bool memSafe) {
       Tracking& track = tracked[name];
       track.usesGlobals = false;
       track.usesMemory = false;
+      track.hasDeps = false;
       track.defNode = defNode;
       track.doesCall = false;
       bool ignoreName = false; // one-time ignorings of names, as first op in sub and call
@@ -1086,12 +1090,13 @@ void eliminate(Ref ast, bool memSafe) {
         Ref type = node[0];
         if (type == NAME) {
           if (!ignoreName) {
-            IString name = node[1]->getIString();
-            if (!asmData.isLocal(name)) {
+            IString depName = node[1]->getIString();
+            if (!asmData.isLocal(depName)) {
               track.usesGlobals = true;
             }
-            if (!potentials.has(name)) { // deps do not matter for potentials - they are defined once, so no complexity
-              track.deps.insert(name);
+            if (!potentials.has(depName)) { // deps do not matter for potentials - they are defined once, so no complexity
+              depMap[depName].push_back(name);
+              track.hasDeps = true;
             }
           } else {
             ignoreName = false;
@@ -1102,15 +1107,15 @@ void eliminate(Ref ast, bool memSafe) {
         } else if (type == CALL) {
           track.usesGlobals = true;
           track.usesMemory = true;
-          track.doesCall = true;        
+          track.doesCall = true;
           ignoreName = true;
         } else {
           ignoreName = false;
         }
       });
-      globalsInvalidated = false;
-      memoryInvalidated = false;
-      callsInvalidated = false;
+      if (track.usesGlobals) globalsInvalidated = false;
+      if (track.usesMemory) memoryInvalidated = false;
+      if (track.doesCall) callsInvalidated = false;
     };
 
     // TODO: invalidate using a sequence number for each type (if you were tracked before the last invalidation, you are cancelled). remove for.in loops
@@ -1133,17 +1138,10 @@ void eliminate(Ref ast, bool memSafe) {
     INVALIDATE(Calls, info.doesCall);
 
     auto invalidateByDep = [&](IString dep) {
-      std::vector<IString> temp;
-      for (auto t : tracked) {
-        IString name = t.first;
-        Tracking& info = tracked[name];
-        if (info.deps.has(dep)) {
-          temp.push_back(name);
-        }
-      }
-      for (size_t i = 0; i < temp.size(); i++) {
-        tracked.erase(temp[i]);
+      for (auto name : depMap[dep]) {
+        tracked.erase(name);
       }
+      depMap.erase(dep);
     };
 
     std::function<void (IString name, Ref node)> doEliminate;
@@ -1153,40 +1151,42 @@ void eliminate(Ref ast, bool memSafe) {
     auto scan = [&](Ref node) {
       bool abort = false;
       bool allowTracking = true; // false inside an if; also prevents recursing in an if
-      std::function<void (Ref, bool, bool)> traverseInOrder = [&](Ref node, bool ignoreSub, bool ignoreName) {
+      std::function<void (Ref, bool)> traverseInOrder = [&](Ref node, bool ignoreSub) {
         if (abort) return;
         Ref type = node[0];
         if (type == ASSIGN) {
           Ref target = node[2];
           Ref value = node[3];
           bool nameTarget = target[0] == NAME;
-          traverseInOrder(target, true,  nameTarget); // evaluate left
-          traverseInOrder(value,  false, false); // evaluate right
+          // If this is an assign to a name, handle it below rather than
+          // traversing and treating as a read
+          if (!nameTarget) {
+            traverseInOrder(target, true); // evaluate left
+          }
+          traverseInOrder(value,  false); // evaluate right
           // do the actual assignment
           if (nameTarget) {
             IString name = target[1]->getIString();
-            if (!potentials.has(name)) {
-              if (!varsToTryToRemove.has(name)) {
-                // expensive check for invalidating specific tracked vars. This list is generally quite short though, because of
-                // how we just eliminate in short spans and abort when control flow happens TODO: history numbers instead
-                invalidateByDep(name); // can happen more than once per dep..
-                if (!asmData.isLocal(name) && !globalsInvalidated) {
-                  invalidateGlobals();
-                  globalsInvalidated = true;
-                }
-                // if we can track this name (that we assign into), and it has 0 uses and we want to remove its VAR
-                // definition - then remove it right now, there is no later chance
-                if (allowTracking && varsToRemove.has(name) && uses[name] == 0) {
-                  track(name, node[3], node);
-                  doEliminate(name, node);
-                }
-              } else {
-                // replace it in-place
-                safeCopy(node, value);
-                varsToRemove[name] = 2;
-              }
+            if (potentials.has(name) && allowTracking) {
+              track(name, node[3], node);
+            } else if (varsToTryToRemove.has(name)) {
+              // replace it in-place
+              safeCopy(node, value);
+              varsToRemove[name] = 2;
             } else {
-              if (allowTracking) track(name, node[3], node);
+              // expensive check for invalidating specific tracked vars. This list is generally quite short though, because of
+              // how we just eliminate in short spans and abort when control flow happens TODO: history numbers instead
+              invalidateByDep(name); // can happen more than once per dep..
+              if (!asmData.isLocal(name) && !globalsInvalidated) {
+                invalidateGlobals();
+                globalsInvalidated = true;
+              }
+              // if we can track this name (that we assign into), and it has 0 uses and we want to remove its VAR
+              // definition - then remove it right now, there is no later chance
+              if (allowTracking && varsToRemove.has(name) && uses[name] == 0) {
+                track(name, node[3], node);
+                doEliminate(name, node);
+              }
             }
           } else if (target[0] == SUB) {
             if (isTempDoublePtrAccess(target)) {
@@ -1200,8 +1200,12 @@ void eliminate(Ref ast, bool memSafe) {
             }
           }
         } else if (type == SUB) {
-          traverseInOrder(node[1], false, !memSafe); // evaluate inner
-          traverseInOrder(node[2], false, false); // evaluate outer
+          // Only keep track of the global array names in memsafe mode i.e.
+          // when they may change underneath us due to resizing
+          if (node[1][0] != NAME || memSafe) {
+            traverseInOrder(node[1], false); // evaluate inner
+          }
+          traverseInOrder(node[2], false); // evaluate outer
           // ignoreSub means we are a write (happening later), not a read
           if (!ignoreSub && !isTempDoublePtrAccess(node)) {
             // do the memory access
@@ -1210,26 +1214,6 @@ void eliminate(Ref ast, bool memSafe) {
               callsInvalidated = true;
             }
           }
-        } else if (type == VAR) {
-          Ref vars = node[1];
-          for (size_t i = 0; i < vars->size(); i++) {
-            IString name = vars[i][0]->getIString();
-            Ref value;
-            if (vars[i]->size() > 1 && !!(value = vars[i][1])) {
-              traverseInOrder(value, false, false);
-              if (potentials.has(name) && allowTracking) {
-                track(name, value, node);
-              } else {
-                invalidateByDep(name);
-              }
-              if (vars->size() == 1 && varsToTryToRemove.has(name) && !!value) {
-                // replace it in-place
-                value = make1(STAT, value);
-                safeCopy(node, value);
-                varsToRemove[name] = 2;
-              }
-            }
-          }
         } else if (type == BINARY) {
           bool flipped = false;
           if (ASSOCIATIVE_BINARIES.has(node[1]) && !NAME_OR_NUM.has(node[2][0]) && NAME_OR_NUM.has(node[3][0])) { // TODO recurse here?
@@ -1239,31 +1223,32 @@ void eliminate(Ref ast, bool memSafe) {
             node[3] = temp;
             flipped = true;
           }
-          traverseInOrder(node[2], false, false);
-          traverseInOrder(node[3], false, false);
+          traverseInOrder(node[2], false);
+          traverseInOrder(node[3], false);
           if (flipped && NAME_OR_NUM.has(node[2][0])) { // dunno if we optimized, but safe to flip back - and keeps the code closer to the original and more readable
             Ref temp = node[2];
             node[2] = node[3];
             node[3] = temp;
           }
         } else if (type == NAME) {
-          if (!ignoreName) { // ignoreName means we are the name of something like a call or a sub - irrelevant for us
-            IString name = node[1]->getIString();
-            if (tracked.has(name)) {
-              doEliminate(name, node);
-            } else if (!asmData.isLocal(name) && !callsInvalidated) {
-              invalidateCalls();
-              callsInvalidated = true;
-            }
+          IString name = node[1]->getIString();
+          if (tracked.has(name)) {
+            doEliminate(name, node);
+          } else if (!asmData.isLocal(name) && !callsInvalidated) {
+            invalidateCalls();
+            callsInvalidated = true;
           }
         } else if (type == UNARY_PREFIX || type == UNARY_POSTFIX) {
-          traverseInOrder(node[2], false, false);
+          traverseInOrder(node[2], false);
         } else if (IGNORABLE_ELIMINATOR_SCAN_NODES.has(type)) {
         } else if (type == CALL) {
-          traverseInOrder(node[1], false, true);
+          // Named functions never change and are therefore safe to not track
+          if (node[1][0] != NAME) {
+            traverseInOrder(node[1], false);
+          }
           Ref args = node[2];
           for (size_t i = 0; i < args->size(); i++) {
-            traverseInOrder(args[i], false, false);
+            traverseInOrder(args[i], false);
           }
           if (callHasSideEffects(node)) {
             // these two invalidations will also invalidate calls
@@ -1278,14 +1263,14 @@ void eliminate(Ref ast, bool memSafe) {
           }
         } else if (type == IF) {
           if (allowTracking) {
-            traverseInOrder(node[1], false, false); // can eliminate into condition, but nowhere else
+            traverseInOrder(node[1], false); // can eliminate into condition, but nowhere else
             if (!callsInvalidated) { // invalidate calls, since we cannot eliminate them into an if that may not execute!
               invalidateCalls();
               callsInvalidated = true;
             }
             allowTracking = false;
-            traverseInOrder(node[2], false, false); // 2 and 3 could be 'parallel', really..
-            if (!!node[3]) traverseInOrder(node[3], false, false);
+            traverseInOrder(node[2], false); // 2 and 3 could be 'parallel', really..
+            if (!!node[3]) traverseInOrder(node[3], false);
             allowTracking = true;
           } else {
             tracked.clear();
@@ -1294,34 +1279,34 @@ void eliminate(Ref ast, bool memSafe) {
           Ref stats = getStatements(node);
           if (!!stats) {
             for (size_t i = 0; i < stats->size(); i++) {
-              traverseInOrder(stats[i], false, false);
+              traverseInOrder(stats[i], false);
             }
           }
         } else if (type == STAT) {
-          traverseInOrder(node[1], false, false);
+          traverseInOrder(node[1], false);
         } else if (type == LABEL) {
-          traverseInOrder(node[2], false, false);
+          traverseInOrder(node[2], false);
         } else if (type == SEQ) {
-          traverseInOrder(node[1], false, false);
-          traverseInOrder(node[2], false, false);
+          traverseInOrder(node[1], false);
+          traverseInOrder(node[2], false);
         } else if (type == DO) {
           if (node[1][0] == NUM && node[1][1]->getNumber() == 0) { // one-time loop
-            traverseInOrder(node[2], false, false);
+            traverseInOrder(node[2], false);
           } else {
             tracked.clear();
           }
         } else if (type == RETURN) {
-          if (!!node[1]) traverseInOrder(node[1], false, false);
+          if (!!node[1]) traverseInOrder(node[1], false);
         } else if (type == CONDITIONAL) {
           if (!callsInvalidated) { // invalidate calls, since we cannot eliminate them into a branch of an LLVM select/JS conditional that does not execute
             invalidateCalls();
             callsInvalidated = true;
           }
-          traverseInOrder(node[1], false, false);
-          traverseInOrder(node[2], false, false);
-          traverseInOrder(node[3], false, false);
+          traverseInOrder(node[1], false);
+          traverseInOrder(node[2], false);
+          traverseInOrder(node[3], false);
         } else if (type == SWITCH) {
-          traverseInOrder(node[1], false, false);
+          traverseInOrder(node[1], false);
           Tracked originalTracked = tracked;
           Ref cases = node[2];
           for (size_t i = 0; i < cases->size(); i++) {
@@ -1329,14 +1314,16 @@ void eliminate(Ref ast, bool memSafe) {
             assert(c[0]->isNull() || c[0][0] == NUM || (c[0][0] == UNARY_PREFIX && c[0][2][0] == NUM));
             Ref stats = c[1];
             for (size_t j = 0; j < stats->size(); j++) {
-              traverseInOrder(stats[j], false, false);
+              traverseInOrder(stats[j], false);
             }
-            // We cannot track from one switch case into another, undo all new trackings TODO: general framework here, use in if-else as well
+            // We cannot track from one switch case into another if there are external dependencies, undo all new trackings
+            // Otherwise we can track, e.g. a var used in a case before assignment in another case is UB in asm.js, so no need for the assignment
+            // TODO: general framework here, use in if-else as well
             std::vector<IString> toDelete;
             for (auto t : tracked) {
               if (!originalTracked.has(t.first)) {
                 Tracking& info = tracked[t.first];
-                if (info.usesGlobals || info.usesMemory || info.deps.size() > 0) {
+                if (info.usesGlobals || info.usesMemory || info.hasDeps) {
                   toDelete.push_back(t.first);
                 }
               }
@@ -1352,7 +1339,7 @@ void eliminate(Ref ast, bool memSafe) {
           abort = true;
         }
       };
-      traverseInOrder(node, false, false);
+      traverseInOrder(node, false);
     };
     //var eliminationLimit = 0; // used to debugging purposes
     doEliminate = [&](IString name, Ref node) {
@@ -1383,7 +1370,7 @@ void eliminate(Ref ast, bool memSafe) {
       // Look for statements, including while-switch pattern
       Ref stats = getStatements(block);
       if (!stats && (block[0] == WHILE && block[2][0] == SWITCH)) {
-        stats = &(makeArray()->push_back(block[2]));
+        stats = &(makeArray(1)->push_back(block[2]));
       }
       if (!stats) return;
       tracked.clear();
@@ -1395,13 +1382,28 @@ void eliminate(Ref ast, bool memSafe) {
         }
         // Check for things that affect elimination
         if (ELIMINATION_SAFE_NODES.has(type)) {
+#ifdef PROFILING
+          tstmtelim += clock() - start;
+          start = clock();
+#endif
           scan(node);
+#ifdef PROFILING
+          tstmtscan += clock() - start;
+          start = clock();
+#endif
+        } else if (type == VAR) {
+          continue; // asm normalisation has reduced 'var' to just the names
         } else {
           tracked.clear(); // not a var or assign, break all potential elimination so far
         }
       }
     });
 
+#ifdef PROFILING
+    tstmtelim += clock() - start;
+    start = clock();
+#endif
+
     StringIntMap seenUses;
     StringStringMap helperReplacements; // for looper-helper optimization
 
@@ -1571,7 +1573,7 @@ void eliminate(Ref ast, bool memSafe) {
                     traversePrePostConditional(curr, looperToLooptemp, [](Ref node){});
                   }
                   asmData.addVar(temp, asmData.getType(looper));
-                  stats->insert(found, make1(STAT, make3(ASSIGN, &(arena.alloc())->setBool(true), makeName(temp), makeName(looper))));
+                  stats->insert(found, make1(STAT, make3(ASSIGN, makeBool(true), makeName(temp), makeName(looper))));
                 }
               }
             }
@@ -1616,14 +1618,30 @@ void eliminate(Ref ast, bool memSafe) {
       }
     });
 
+#ifdef PROFILING
+    tcleanvars += clock() - start;
+    start = clock();
+#endif
+
     for (auto v : varsToRemove) {
       if (v.second == 2 && asmData.isVar(v.first)) asmData.deleteVar(v.first);
     }
 
     asmData.denormalize();
+
+#ifdef PROFILING
+    treconstruct += clock() - start;
+    start = clock();
+#endif
+
   });
 
   removeAllEmptySubNodes(ast);
+
+#ifdef PROFILING
+  errv("    EL stages: a:%li fe:%li vc:%li se:%li (ss:%li) cv:%li r:%li",
+    tasmdata, tfnexamine, tvarcheck, tstmtelim, tstmtscan, tcleanvars, treconstruct);
+#endif
 }
 
 void eliminateMemSafe(Ref ast) {
@@ -2127,7 +2145,7 @@ void simplifyIfs(Ref ast) {
               Ref curr = deStat(stats[i]);
               other[1] = make2(SEQ, curr, other[1]);
             }
-            Ref temp = makeArray();
+            Ref temp = makeArray(1);
             temp->push_back(other);
             stats = body[1] = temp;
           }
@@ -2293,7 +2311,7 @@ void registerize(Ref ast) {
       Ref assign = makeNum(0);
       // TODO: will be an isEmpty here, can reuse it.
       fun[3]->insert(0, make1(VAR, fun[2]->map([&assign](Ref param) {
-        return &(makeArray()->push_back(param).push_back(assign));
+        return &(makeArray(2)->push_back(param).push_back(assign));
       })));
     }
     // Replace all var definitions with assignments; we will add var definitions at the top after we registerize
@@ -4016,7 +4034,7 @@ void eliminateDeadFuncs(Ref ast) {
     }
     AsmData asmData(fun);
     fun[3]->setSize(1);
-    fun[3][0] = make1(STAT, make2(CALL, makeName(ABORT), &(makeArray())->push_back(makeNum(-1))));
+    fun[3][0] = make1(STAT, make2(CALL, makeName(ABORT), &(makeArray(1))->push_back(makeNum(-1))));
     asmData.vars.clear();
     asmData.denormalize();
   });
diff --git a/tools/optimizer/parser.h b/tools/optimizer/parser.h
index 61e072d6a1de8..9f5b6c3a765c2 100644
--- a/tools/optimizer/parser.h
+++ b/tools/optimizer/parser.h
@@ -134,7 +134,7 @@ template<class NodeRef, class Builder>
 class Parser {
 
   static bool isSpace(char x) { return x == 32 || x == 9 || x == 10 || x == 13; } /* space, tab, linefeed/newline, or return */
-  static char* skipSpace(char* curr) {
+  static void skipSpace(char*& curr) {
     while (*curr) {
       if (isSpace(*curr)) {
         curr++;
@@ -152,9 +152,8 @@ class Parser {
         curr += 2;
         continue;
       }
-      break;
+      return;
     }
-    return curr;
   }
 
   static bool isDigit(char x) { return x >= '0' && x <= '9'; }
@@ -193,7 +192,6 @@ class Parser {
     }
 
     explicit Frag(char* src) {
-      assert(!isSpace(*src));
       char *start = src;
       if (isIdentInit(*src)) {
         // read an identifier or a keyword
@@ -210,12 +208,6 @@ class Parser {
           *src = temp;
         }
         type = keywords.has(str) ? KEYWORD : IDENT;
-      } else if (*src == '"' || *src == '\'') {
-        char *end = strchr(src+1, *src);
-        *end = 0;
-        str.set(src+1);
-        src = end+1;
-        type = STRING;
       } else if (isDigit(*src) || (src[0] == '.' && isDigit(src[1]))) {
         if (src[0] == '0' && (src[1] == 'x' || src[1] == 'X')) {
           // Explicitly parse hex numbers of form "0x...", because strtod
@@ -259,8 +251,8 @@ class Parser {
           case '^': str = XOR; break;
           case '|': str = OR; break;
           case '~': str = B_NOT; break;
+          default: abort();
         }
-        assert(!str.isNull());
         size = strlen(str.str);
 #ifndef NDEBUG
         char temp = start[size];
@@ -277,6 +269,12 @@ class Parser {
         str.set(src, false);
         src[1] = temp;
         src++;
+      } else if (*src == '"' || *src == '\'') {
+        char *end = strchr(src+1, *src);
+        *end = 0;
+        str.set(src+1);
+        src = end+1;
+        type = STRING;
       } else {
         dump("frag parsing", src);
         abort();
@@ -285,23 +283,51 @@ class Parser {
     }
   };
 
+  struct ExpressionElement {
+    bool isNode;
+#ifndef _MSC_VER // MSVC does not allow unrestricted unions: http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2008/n2544.pdf
+    union {
+#endif
+      NodeRef node;
+      IString op;
+#ifndef _MSC_VER
+    };
+#endif
+    ExpressionElement(NodeRef n) : isNode(true), node(n) {}
+    ExpressionElement(IString o) : isNode(false), op(o) {}
+
+    NodeRef getNode() {
+      assert(isNode);
+      return node;
+    }
+    IString getOp() {
+      assert(!isNode);
+      return op;
+    }
+  };
+
+  // This is a list of the current stack of node-operator-node-operator-etc.
+  // this works by each parseExpression call appending to the vector; then recursing out, and the toplevel sorts it all
+  typedef std::vector<ExpressionElement> ExpressionParts;
+  std::vector<ExpressionParts> expressionPartsStack;
+
   // Parses an element in a list of such elements, e.g. list of statements in a block, or list of parameters in a call
   NodeRef parseElement(char*& src, const char* seps=";") {
     //dump("parseElement", src);
-    src = skipSpace(src);
+    skipSpace(src);
     Frag frag(src);
     src += frag.size;
     switch (frag.type) {
       case KEYWORD: {
         return parseAfterKeyword(frag, src, seps);
       }
-      case IDENT:
+      case IDENT: {
+        return parseAfterIdent(frag, src, seps);
+      }
       case STRING:
       case INT:
       case DOUBLE: {
-        src = skipSpace(src);
-        if (frag.type == IDENT) return parseAfterIdent(frag, src, seps);
-        else return parseExpression(parseFrag(frag), src, seps);
+        return parseExpression(parseFrag(frag), src, seps);
       }
       case SEPARATOR: {
         if (frag.str == OPEN_PAREN) return parseExpression(parseAfterParen(src), src, seps);
@@ -329,24 +355,24 @@ class Parser {
   }
 
   NodeRef parseAfterKeyword(Frag& frag, char*& src, const char* seps) {
-    src = skipSpace(src);
-    if (frag.str == FUNCTION) return parseFunction(frag, src, seps);
-    else if (frag.str == VAR) return parseVar(frag, src, seps);
-    else if (frag.str == CONST) return parseVar(frag, src, seps);
-    else if (frag.str == RETURN) return parseReturn(frag, src, seps);
-    else if (frag.str == IF) return parseIf(frag, src, seps);
-    else if (frag.str == DO) return parseDo(frag, src, seps);
-    else if (frag.str == WHILE) return parseWhile(frag, src, seps);
-    else if (frag.str == BREAK) return parseBreak(frag, src, seps);
-    else if (frag.str == CONTINUE) return parseContinue(frag, src, seps);
-    else if (frag.str == SWITCH) return parseSwitch(frag, src, seps);
-    else if (frag.str == NEW) return parseNew(frag, src, seps);
+    skipSpace(src);
+    if (frag.str == FUNCTION) return parseFunction(src, seps);
+    else if (frag.str == VAR) return parseVar(src, seps, false);
+    else if (frag.str == CONST) return parseVar(src, seps, true);
+    else if (frag.str == RETURN) return parseReturn(src, seps);
+    else if (frag.str == IF) return parseIf(src, seps);
+    else if (frag.str == DO) return parseDo(src, seps);
+    else if (frag.str == WHILE) return parseWhile(src, seps);
+    else if (frag.str == BREAK) return parseBreak(src, seps);
+    else if (frag.str == CONTINUE) return parseContinue(src, seps);
+    else if (frag.str == SWITCH) return parseSwitch(src, seps);
+    else if (frag.str == NEW) return parseNew(src, seps);
     dump(frag.str.str, src);
     abort();
     return nullptr;
   }
 
-  NodeRef parseFunction(Frag& frag, char*& src, const char* seps) {
+  NodeRef parseFunction(char*& src, const char* seps) {
     Frag name(src);
     if (name.type == IDENT) {
       src += name.size;
@@ -355,75 +381,73 @@ class Parser {
       name.str = IString();
     }
     NodeRef ret = Builder::makeFunction(name.str);
-    src = skipSpace(src);
+    skipSpace(src);
     assert(*src == '(');
     src++;
     while (1) {
-      src = skipSpace(src);
+      skipSpace(src);
       if (*src == ')') break;
       Frag arg(src);
       assert(arg.type == IDENT);
       src += arg.size;
       Builder::appendArgumentToFunction(ret, arg.str);
-      src = skipSpace(src);
-      if (*src && *src == ')') break;
-      if (*src && *src == ',') {
+      skipSpace(src);
+      if (*src == ')') break;
+      if (*src == ',') {
         src++;
         continue;
       }
       abort();
     }
-    assert(*src == ')');
     src++;
-    parseBracketedBlock(src, ret);
+    Builder::setBlockContent(ret, parseBracketedBlock(src));
     // TODO: parse expression?
     return ret;
   }
 
-  NodeRef parseVar(Frag& frag, char*& src, const char* seps) {
-    NodeRef ret = Builder::makeVar(frag.str == CONST);
+  NodeRef parseVar(char*& src, const char* seps, bool is_const) {
+    NodeRef ret = Builder::makeVar(is_const);
     while (1) {
-      src = skipSpace(src);
+      skipSpace(src);
       if (*src == ';') break;
       Frag name(src);
       assert(name.type == IDENT);
       NodeRef value;
       src += name.size;
-      src = skipSpace(src);
+      skipSpace(src);
       if (*src == '=') {
         src++;
-        src = skipSpace(src);
+        skipSpace(src);
         value = parseElement(src, ";,");
       }
       Builder::appendToVar(ret, name.str, value);
-      src = skipSpace(src);
-      if (*src && *src == ';') break;
-      if (*src && *src == ',') {
+      skipSpace(src);
+      if (*src == ';') break;
+      if (*src == ',') {
         src++;
         continue;
       }
       abort();
     }
-    assert(*src == ';');
     src++;
     return ret;
   }
 
-  NodeRef parseReturn(Frag& frag, char*& src, const char* seps) {
-    src = skipSpace(src);
+  NodeRef parseReturn(char*& src, const char* seps) {
+    skipSpace(src);
     NodeRef value = !hasChar(seps, *src) ? parseElement(src, seps) : nullptr;
-    src = skipSpace(src);
+    skipSpace(src);
     assert(hasChar(seps, *src));
     if (*src == ';') src++;
     return Builder::makeReturn(value);
   }
 
-  NodeRef parseIf(Frag& frag, char*& src, const char* seps) {
+  NodeRef parseIf(char*& src, const char* seps) {
     NodeRef condition = parseParenned(src);
     NodeRef ifTrue = parseMaybeBracketed(src, seps);
-    src = skipSpace(src);
+    skipSpace(src);
     NodeRef ifFalse;
-    if (*src && !hasChar(seps, *src)) {
+    if (!hasChar(seps, *src)) {
       Frag next(src);
       if (next.type == KEYWORD && next.str == ELSE) {
         src += next.size;
@@ -433,9 +457,9 @@ class Parser {
     return Builder::makeIf(condition, ifTrue, ifFalse);
   }
 
-  NodeRef parseDo(Frag& frag, char*& src, const char* seps) {
+  NodeRef parseDo(char*& src, const char* seps) {
     NodeRef body = parseMaybeBracketed(src, seps);
-    src = skipSpace(src);
+    skipSpace(src);
     Frag next(src);
     assert(next.type == KEYWORD && next.str == WHILE);
     src += next.size;
@@ -443,40 +467,40 @@ class Parser {
     return Builder::makeDo(body, condition);
   }
 
-  NodeRef parseWhile(Frag& frag, char*& src, const char* seps) {
+  NodeRef parseWhile(char*& src, const char* seps) {
     NodeRef condition = parseParenned(src);
     NodeRef body = parseMaybeBracketed(src, seps);
     return Builder::makeWhile(condition, body);
   }
 
-  NodeRef parseBreak(Frag& frag, char*& src, const char* seps) {
-    src = skipSpace(src);
+  NodeRef parseBreak(char*& src, const char* seps) {
+    skipSpace(src);
     Frag next(src);
     if (next.type == IDENT) src += next.size;
     return Builder::makeBreak(next.type == IDENT ? next.str : IString());
   }
 
-  NodeRef parseContinue(Frag& frag, char*& src, const char* seps) {
-    src = skipSpace(src);
+  NodeRef parseContinue(char*& src, const char* seps) {
+    skipSpace(src);
     Frag next(src);
     if (next.type == IDENT) src += next.size;
     return Builder::makeContinue(next.type == IDENT ? next.str : IString());
   }
 
-  NodeRef parseSwitch(Frag& frag, char*& src, const char* seps) {
+  NodeRef parseSwitch(char*& src, const char* seps) {
     NodeRef ret = Builder::makeSwitch(parseParenned(src));
-    src = skipSpace(src);
+    skipSpace(src);
     assert(*src == '{');
     src++;
     while (1) {
       // find all cases and possibly a default
-      src = skipSpace(src);
+      skipSpace(src);
       if (*src == '}') break;
       Frag next(src);
       if (next.type == KEYWORD) {
         if (next.str == CASE) {
           src += next.size;
-          src = skipSpace(src);
+          skipSpace(src);
           NodeRef arg;
           Frag value(src);
           if (value.isNumber()) {
@@ -486,21 +510,21 @@ class Parser {
             assert(value.type == OPERATOR);
             assert(value.str == MINUS);
             src += value.size;
-            src = skipSpace(src);
+            skipSpace(src);
             Frag value2(src);
             assert(value2.isNumber());
             arg = Builder::makePrefix(MINUS, parseFrag(value2));
             src += value2.size;
           }
           Builder::appendCaseToSwitch(ret, arg);
-          src = skipSpace(src);
+          skipSpace(src);
           assert(*src == ':');
           src++;
           continue;
         } else if (next.str == DEFAULT) {
           src += next.size;
           Builder::appendDefaultToSwitch(ret);
-          src = skipSpace(src);
+          skipSpace(src);
           assert(*src == ':');
           src++;
           continue;
@@ -508,27 +532,28 @@ class Parser {
         // otherwise, may be some keyword that happens to start a block (e.g. case 1: _return_ 5)
       }
       // not case X: or default: or }, so must be some code
-      src = skipSpace(src);
+      skipSpace(src);
       bool explicitBlock = *src == '{';
-      Builder::appendCodeToSwitch(ret, parseMaybeBracketedBlock(src, ";}", CASE, DEFAULT), explicitBlock);
+      NodeRef subBlock = explicitBlock ? parseBracketedBlock(src) : parseBlock(src, ";}", CASE, DEFAULT);
+      Builder::appendCodeToSwitch(ret, subBlock, explicitBlock);
     }
-    src = skipSpace(src);
+    skipSpace(src);
     assert(*src == '}');
     src++;
     return ret;
   }
 
-  NodeRef parseNew(Frag& frag, char*& src, const char* seps) {
+  NodeRef parseNew(char*& src, const char* seps) {
     return Builder::makeNew(parseElement(src, seps));
   }
 
   NodeRef parseAfterIdent(Frag& frag, char*& src, const char* seps) {
-    assert(!isSpace(*src));
+    skipSpace(src);
     if (*src == '(') return parseExpression(parseCall(parseFrag(frag), src), src, seps);
     if (*src == '[') return parseExpression(parseIndexing(parseFrag(frag), src), src, seps);
     if (*src == ':' && expressionPartsStack.back().size() == 0) {
       src++;
-      src = skipSpace(src);
+      skipSpace(src);
       NodeRef inner;
       if (*src == '{') { // context lets us know this is not an object, but a block
         inner = parseBracketedBlock(src);
@@ -547,12 +572,12 @@ class Parser {
     src++;
     NodeRef ret = Builder::makeCall(target);
     while (1) {
-      src = skipSpace(src);
+      skipSpace(src);
       if (*src == ')') break;
       Builder::appendToCall(ret, parseElement(src, ",)"));
-      src = skipSpace(src);
-      if (*src && *src == ')') break;
-      if (*src && *src == ',') {
+      skipSpace(src);
+      if (*src == ')') break;
+      if (*src == ',') {
         src++;
         continue;
       }
@@ -569,7 +594,7 @@ class Parser {
     assert(*src == '[');
     src++;
     NodeRef ret = Builder::makeIndexing(target, parseElement(src, "]"));
-    src = skipSpace(src);
+    skipSpace(src);
     assert(*src == ']');
     src++;
     assert(expressionPartsStack.back().size() == 0);
@@ -588,9 +613,9 @@ class Parser {
 
   NodeRef parseAfterParen(char*& src) {
     expressionPartsStack.resize(expressionPartsStack.size()+1);
-    src = skipSpace(src);
+    skipSpace(src);
     NodeRef ret = parseElement(src, ")");
-    src = skipSpace(src);
+    skipSpace(src);
     assert(*src == ')');
     src++;
     assert(expressionPartsStack.back().size() == 0);
@@ -602,18 +627,19 @@ class Parser {
     expressionPartsStack.resize(expressionPartsStack.size()+1);
     NodeRef ret = Builder::makeArray();
     while (1) {
-      src = skipSpace(src);
+      skipSpace(src);
       assert(*src);
       if (*src == ']') break;
       NodeRef element = parseElement(src, ",]");
       Builder::appendToArray(ret, element);
-      src = skipSpace(src);
+      skipSpace(src);
+      if (*src == ']') break;
       if (*src == ',') {
         src++;
         continue;
-      } else assert(*src == ']');
+      }
+      abort();
     }
-    assert(*src == ']');
     src++;
     return ret;
   }
@@ -622,56 +648,29 @@ class Parser {
     expressionPartsStack.resize(expressionPartsStack.size()+1);
     NodeRef ret = Builder::makeObject();
     while (1) {
-      src = skipSpace(src);
+      skipSpace(src);
       assert(*src);
       if (*src == '}') break;
       Frag key(src);
       assert(key.type == IDENT || key.type == STRING);
       src += key.size;
-      src = skipSpace(src);
+      skipSpace(src);
       assert(*src == ':');
       src++;
       NodeRef value = parseElement(src, ",}");
       Builder::appendToObject(ret, key.str, value);
-      src = skipSpace(src);
+      skipSpace(src);
+      if (*src == '}') break;
       if (*src == ',') {
         src++;
         continue;
-      } else assert(*src == '}');
+      }
+      abort();
     }
-    assert(*src == '}');
     src++;
     return ret;
   }
 
-  struct ExpressionElement {
-    bool isNode;
-#ifndef _MSC_VER // MSVC does not allow unrestricted unions: http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2008/n2544.pdf
-	union {
-#endif
-      NodeRef node;
-      IString op;
-#ifndef _MSC_VER
-    };
-#endif
-    ExpressionElement(NodeRef n) : isNode(true), node(n) {}
-    ExpressionElement(IString o) : isNode(false), op(o) {}
-
-    NodeRef getNode() {
-      assert(isNode);
-      return node;
-    }
-    IString getOp() {
-      assert(!isNode);
-      return op;
-    }
-  };
-
-  // This is a list of the current stack of node-operator-node-operator-etc.
-  // this works by each parseExpression call appending to the vector; then recursing out, and the toplevel sorts it all
-  typedef std::vector<ExpressionElement> ExpressionParts;
-  std::vector<ExpressionParts> expressionPartsStack;
-
   void dumpParts(ExpressionParts& parts, int i) {
     printf("expressionparts: %d (at %d)\n", parts.size(), i);
     printf("| ");
@@ -697,7 +696,7 @@ class Parser {
   NodeRef parseExpression(ExpressionElement initial, char*&src, const char* seps) {
     //dump("parseExpression", src);
     ExpressionParts& parts = expressionPartsStack.back();
-    src = skipSpace(src);
+    skipSpace(src);
     if (*src == 0 || hasChar(seps, *src)) {
       if (parts.size() > 0) {
         parts.push_back(initial); // cherry on top of the cake
@@ -787,11 +786,11 @@ class Parser {
   }
 
   // Parses a block of code (e.g. a bunch of statements inside {,}, or the top level of o file)
-  NodeRef parseBlock(char*& src, NodeRef block=nullptr, const char* seps=";", IString keywordSep1=IString(), IString keywordSep2=IString()) {
+  NodeRef parseBlock(char*& src, const char* seps=";", IString keywordSep1=IString(), IString keywordSep2=IString()) {
+    NodeRef block = Builder::makeBlock();
     //dump("parseBlock", src);
-    if (!block) block = Builder::makeBlock();
-    while (*src) {
-      src = skipSpace(src);
+    while (1) {
+      skipSpace(src);
       if (*src == 0) break;
       if (*src == ';') {
         src++; // skip a statement in this block
@@ -812,25 +811,24 @@ class Parser {
     return block;
   }
 
-  NodeRef parseBracketedBlock(char*& src, NodeRef block=nullptr) {
-    if (!block) block = Builder::makeBlock();
-    src = skipSpace(src);
+  NodeRef parseBracketedBlock(char*& src) {
+    skipSpace(src);
     assert(*src == '{');
     src++;
-    parseBlock(src, block, ";}"); // the two are not symmetrical, ; is just internally separating, } is the final one - parseBlock knows all this
+    NodeRef block = parseBlock(src, ";}"); // the two are not symmetrical, ; is just internally separating, } is the final one - parseBlock knows all this
     assert(*src == '}');
     src++;
     return block;
   }
 
   NodeRef parseElementOrStatement(char*& src, const char *seps) {
-    src = skipSpace(src);
+    skipSpace(src);
     if (*src == ';') {
       src++;
       return Builder::makeBlock(); // we don't need the brackets here, but oh well
     }
     NodeRef ret = parseElement(src, seps);
-    src = skipSpace(src);
+    skipSpace(src);
     if (*src == ';') {
       ret = Builder::makeStatement(ret);
       src++;
@@ -839,21 +837,16 @@ class Parser {
   }
 
   NodeRef parseMaybeBracketed(char*& src, const char *seps) {
-    src = skipSpace(src);
+    skipSpace(src);
     return *src == '{' ? parseBracketedBlock(src) : parseElementOrStatement(src, seps);
   }
 
-  NodeRef parseMaybeBracketedBlock(char*& src, const char *seps, IString keywordSep1=IString(), IString keywordSep2=IString()) {
-    src = skipSpace(src);
-    return *src == '{' ? parseBracketedBlock(src) : parseBlock(src, nullptr, seps, keywordSep1, keywordSep2);
-  }
-
   NodeRef parseParenned(char*& src) {
-    src = skipSpace(src);
+    skipSpace(src);
     assert(*src == '(');
     src++;
     NodeRef ret = parseElement(src, ")");
-    src = skipSpace(src);
+    skipSpace(src);
     assert(*src == ')');
     src++;
     return ret;
@@ -897,7 +890,9 @@ class Parser {
   NodeRef parseToplevel(char* src) {
     allSource = src;
     allSize = strlen(src);
-    return parseBlock(src, Builder::makeToplevel());
+    NodeRef toplevel = Builder::makeToplevel();
+    Builder::setBlockContent(toplevel, parseBlock(src));
+    return toplevel;
   }
 };
 
diff --git a/tools/optimizer/simple_ast.cpp b/tools/optimizer/simple_ast.cpp
index f744604f1aaba..3027010d13434 100644
--- a/tools/optimizer/simple_ast.cpp
+++ b/tools/optimizer/simple_ast.cpp
@@ -47,6 +47,14 @@ Ref Arena::alloc() {
   return &chunks.back()[index++];
 }
 
+ArrayStorage* Arena::allocArray() {
+  if (arr_chunks.size() == 0 || arr_index == CHUNK_SIZE) {
+    arr_chunks.push_back(new ArrayStorage[CHUNK_SIZE]);
+    arr_index = 0;
+  }
+  return &arr_chunks.back()[arr_index++];
+}
+
 // dump
 
 void dump(const char *str, Ref node, bool pretty) {
@@ -62,8 +70,9 @@ void dump(const char *str, Ref node, bool pretty) {
 
 struct TraverseInfo {
   TraverseInfo() {}
-  TraverseInfo(Ref node) : node(node), index(0) {}
+  TraverseInfo(Ref node, ArrayStorage* arr) : node(node), arr(arr), index(0) {}
   Ref node;
+  ArrayStorage* arr;
   int index;
 };
 
@@ -123,18 +132,32 @@ void traversePre(Ref node, std::function<void (Ref)> visit) {
   if (!visitable(node)) return;
   visit(node);
   StackedStack<TraverseInfo, TRAV_STACK> stack;
-  stack.push_back(TraverseInfo(node));
-  while (stack.size() > 0) {
-    TraverseInfo& top = stack.back();
-    if (top.index < (int)top.node->size()) {
-      Ref sub = top.node[top.index];
-      top.index++;
+  int index = 0;
+  ArrayStorage* arr = &node->getArray();
+  int arrsize = (int)arr->size();
+  Ref* arrdata = arr->data();
+  stack.push_back(TraverseInfo(node, arr));
+  while (1) {
+    if (index < arrsize) {
+      Ref sub = *(arrdata+index);
+      index++;
       if (visitable(sub)) {
+        stack.back().index = index;
+        index = 0;
         visit(sub);
-        stack.push_back(TraverseInfo(sub));
+        arr = &sub->getArray();
+        arrsize = (int)arr->size();
+        arrdata = arr->data();
+        stack.push_back(TraverseInfo(sub, arr));
       }
     } else {
       stack.pop_back();
+      if (stack.size() == 0) break;
+      TraverseInfo& back = stack.back();
+      index = back.index;
+      arr = back.arr;
+      arrsize = (int)arr->size();
+      arrdata = arr->data();
     }
   }
 }
@@ -144,19 +167,33 @@ void traversePrePost(Ref node, std::function<void (Ref)> visitPre, std::function
   if (!visitable(node)) return;
   visitPre(node);
   StackedStack<TraverseInfo, TRAV_STACK> stack;
-  stack.push_back(TraverseInfo(node));
-  while (stack.size() > 0) {
-    TraverseInfo& top = stack.back();
-    if (top.index < (int)top.node->size()) {
-      Ref sub = top.node[top.index];
-      top.index++;
+  int index = 0;
+  ArrayStorage* arr = &node->getArray();
+  int arrsize = (int)arr->size();
+  Ref* arrdata = arr->data();
+  stack.push_back(TraverseInfo(node, arr));
+  while (1) {
+    if (index < arrsize) {
+      Ref sub = *(arrdata+index);
+      index++;
       if (visitable(sub)) {
+        stack.back().index = index;
+        index = 0;
         visitPre(sub);
-        stack.push_back(TraverseInfo(sub));
+        arr = &sub->getArray();
+        arrsize = (int)arr->size();
+        arrdata = arr->data();
+        stack.push_back(TraverseInfo(sub, arr));
       }
     } else {
-      visitPost(top.node);
+      visitPost(stack.back().node);
       stack.pop_back();
+      if (stack.size() == 0) break;
+      TraverseInfo& back = stack.back();
+      index = back.index;
+      arr = back.arr;
+      arrsize = (int)arr->size();
+      arrdata = arr->data();
     }
   }
 }
@@ -166,20 +203,34 @@ void traversePrePostConditional(Ref node, std::function<bool (Ref)> visitPre, st
   if (!visitable(node)) return;
   if (!visitPre(node)) return;
   StackedStack<TraverseInfo, TRAV_STACK> stack;
-  stack.push_back(TraverseInfo(node));
-  while (stack.size() > 0) {
-    TraverseInfo& top = stack.back();
-    if (top.index < (int)top.node->size()) {
-      Ref sub = top.node[top.index];
-      top.index++;
+  int index = 0;
+  ArrayStorage* arr = &node->getArray();
+  int arrsize = (int)arr->size();
+  Ref* arrdata = arr->data();
+  stack.push_back(TraverseInfo(node, arr));
+  while (1) {
+    if (index < arrsize) {
+      Ref sub = *(arrdata+index);
+      index++;
       if (visitable(sub)) {
         if (visitPre(sub)) {
-          stack.push_back(TraverseInfo(sub));
+          stack.back().index = index;
+          index = 0;
+          arr = &sub->getArray();
+          arrsize = (int)arr->size();
+          arrdata = arr->data();
+          stack.push_back(TraverseInfo(sub, arr));
         }
       }
     } else {
-      visitPost(top.node);
+      visitPost(stack.back().node);
       stack.pop_back();
+      if (stack.size() == 0) break;
+      TraverseInfo& back = stack.back();
+      index = back.index;
+      arr = back.arr;
+      arrsize = (int)arr->size();
+      arrdata = arr->data();
     }
   }
 }
diff --git a/tools/optimizer/simple_ast.h b/tools/optimizer/simple_ast.h
index 0c94a15bb7eb0..5d5e3f2016074 100644
--- a/tools/optimizer/simple_ast.h
+++ b/tools/optimizer/simple_ast.h
@@ -55,14 +55,20 @@ struct Ref {
 
 // Arena allocation, free it all on process exit
 
+typedef std::vector<Ref> ArrayStorage;
+
 struct Arena {
   #define CHUNK_SIZE 1000
   std::vector<Value*> chunks;
   int index; // in last chunk
 
-  Arena() : index(0) {}
+  std::vector<ArrayStorage*> arr_chunks;
+  int arr_index;
+
+  Arena() : index(0), arr_index(0) {}
 
   Ref alloc();
+  ArrayStorage* allocArray();
 };
 
 extern Arena arena;
@@ -80,7 +86,6 @@ struct Value {
 
   Type type;
 
-  typedef std::vector<Ref> ArrayStorage;
   typedef std::unordered_map<IString, Ref> ObjectStorage;
 
 #ifdef _MSC_VER // MSVC does not allow unrestricted unions: http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2008/n2544.pdf
@@ -115,7 +120,7 @@ struct Value {
   }
 
   void free() {
-    if (type == Array) delete arr;
+    if (type == Array) { arr->clear(); arr->shrink_to_fit(); }
     else if (type == Object) delete obj;
     type = Null;
     num = 0;
@@ -142,14 +147,15 @@ struct Value {
   Value& setArray(ArrayStorage &a) {
     free();
     type = Array;
-    arr = new ArrayStorage();
+    arr = arena.allocArray();
     *arr = a;
     return *this;
   }
-  Value& setArray() {
+  Value& setArray(int size_hint=0) {
     free();
     type = Array;
-    arr = new ArrayStorage();
+    arr = arena.allocArray();
+    arr->reserve(size_hint);
     return *this;
   }
   Value& setNull() {
@@ -443,8 +449,7 @@ struct Value {
 
   Ref& operator[](unsigned x) {
     assert(isArray());
-    assert(x < arr->size());
-    return (*arr)[x];
+    return arr->at(x);
   }
 
   Value& push_back(Ref r) {
@@ -1272,8 +1277,8 @@ class ValueBuilder {
     return &arena.alloc()->setString(s);
   }
 
-  static Ref makeRawArray() {
-    return &arena.alloc()->setArray();
+  static Ref makeRawArray(int size_hint=0) {
+    return &arena.alloc()->setArray(size_hint);
   }
 
   static Ref makeNull() {
@@ -1282,37 +1287,42 @@ class ValueBuilder {
 
 public:
   static Ref makeToplevel() {
-    return &makeRawArray()->push_back(makeRawString(TOPLEVEL))
-                           .push_back(makeRawArray());
+    return &makeRawArray(2)->push_back(makeRawString(TOPLEVEL))
+                            .push_back(makeRawArray());
   }
 
   static Ref makeString(IString str) {
-    return &makeRawArray()->push_back(makeRawString(STRING))
-                           .push_back(makeRawString(str));
+    return &makeRawArray(2)->push_back(makeRawString(STRING))
+                            .push_back(makeRawString(str));
   }
 
   static Ref makeBlock() {
-    return &makeRawArray()->push_back(makeRawString(BLOCK))
-                           .push_back(makeRawArray());
+    return &makeRawArray(2)->push_back(makeRawString(BLOCK))
+                            .push_back(makeRawArray());
   }
 
   static Ref makeName(IString name) {
-    return &makeRawArray()->push_back(makeRawString(NAME))
-                           .push_back(makeRawString(name));
+    return &makeRawArray(2)->push_back(makeRawString(NAME))
+                            .push_back(makeRawString(name));
   }
 
-  static void appendToBlock(Ref block, Ref element) {
-    if (block[0] == BLOCK || block[0] == TOPLEVEL) {
-      block[1]->push_back(element);
-    } else if (block[0] == DEFUN) {
-      block[3]->push_back(element);
+  static void setBlockContent(Ref target, Ref block) {
+    if (target[0] == TOPLEVEL) {
+      target[1]->setArray(block[1]->getArray());
+    } else if (target[0] == DEFUN) {
+      target[3]->setArray(block[1]->getArray());
     } else abort();
   }
 
+  static void appendToBlock(Ref block, Ref element) {
+    assert(block[0] == BLOCK);
+    block[1]->push_back(element);
+  }
+
   static Ref makeCall(Ref target) {
-    return &makeRawArray()->push_back(makeRawString(CALL))
-                           .push_back(target)
-                           .push_back(makeRawArray());
+    return &makeRawArray(3)->push_back(makeRawString(CALL))
+                            .push_back(target)
+                            .push_back(makeRawArray());
   }
 
   static void appendToCall(Ref call, Ref element) {
@@ -1322,16 +1332,16 @@ class ValueBuilder {
 
   static Ref makeStatement(Ref contents) {
     if (statable.has(contents[0]->getIString())) {
-      return &makeRawArray()->push_back(makeRawString(STAT))
-                             .push_back(contents);
+      return &makeRawArray(2)->push_back(makeRawString(STAT))
+                              .push_back(contents);
     } else {
       return contents; // only very specific things actually need to be stat'ed
     }
   }
 
   static Ref makeDouble(double num) {
-    return &makeRawArray()->push_back(makeRawString(NUM))
-                           .push_back(&arena.alloc()->setNumber(num));
+    return &makeRawArray(2)->push_back(makeRawString(NUM))
+                            .push_back(&arena.alloc()->setNumber(num));
   }
   static Ref makeInt(uint32_t num) {
     return makeDouble(double(num));
@@ -1339,33 +1349,33 @@ class ValueBuilder {
 
   static Ref makeBinary(Ref left, IString op, Ref right) {
     if (op == SET) {
-      return &makeRawArray()->push_back(makeRawString(ASSIGN))
-                             .push_back(&arena.alloc()->setBool(true))
-                             .push_back(left)
-                             .push_back(right);
+      return &makeRawArray(4)->push_back(makeRawString(ASSIGN))
+                              .push_back(&arena.alloc()->setBool(true))
+                              .push_back(left)
+                              .push_back(right);
     } else if (op == COMMA) {
-      return &makeRawArray()->push_back(makeRawString(SEQ))
-                             .push_back(left)
-                             .push_back(right);
+      return &makeRawArray(3)->push_back(makeRawString(SEQ))
+                              .push_back(left)
+                              .push_back(right);
     } else {
-      return &makeRawArray()->push_back(makeRawString(BINARY))
-                             .push_back(makeRawString(op))
-                             .push_back(left)
-                             .push_back(right);
+      return &makeRawArray(4)->push_back(makeRawString(BINARY))
+                              .push_back(makeRawString(op))
+                              .push_back(left)
+                              .push_back(right);
     }
   }
 
   static Ref makePrefix(IString op, Ref right) {
-    return &makeRawArray()->push_back(makeRawString(UNARY_PREFIX))
-                           .push_back(makeRawString(op))
-                           .push_back(right);
+    return &makeRawArray(3)->push_back(makeRawString(UNARY_PREFIX))
+                            .push_back(makeRawString(op))
+                            .push_back(right);
   }
 
   static Ref makeFunction(IString name) {
-    return &makeRawArray()->push_back(makeRawString(DEFUN))
-                           .push_back(makeRawString(name))
-                           .push_back(makeRawArray())
-                           .push_back(makeRawArray());
+    return &makeRawArray(4)->push_back(makeRawString(DEFUN))
+                            .push_back(makeRawString(name))
+                            .push_back(makeRawArray())
+                            .push_back(makeRawArray());
   }
 
   static void appendArgumentToFunction(Ref func, IString arg) {
@@ -1374,81 +1384,86 @@ class ValueBuilder {
   }
 
   static Ref makeVar(bool is_const) {
-    return &makeRawArray()->push_back(makeRawString(VAR))
-                           .push_back(makeRawArray());
+    return &makeRawArray(2)->push_back(makeRawString(VAR))
+                            .push_back(makeRawArray());
   }
 
   static void appendToVar(Ref var, IString name, Ref value) {
     assert(var[0] == VAR);
-    Ref array = &makeRawArray()->push_back(makeRawString(name));
+    Ref array = &makeRawArray(1)->push_back(makeRawString(name));
     if (!!value) array->push_back(value);
     var[1]->push_back(array);
   }
 
   static Ref makeReturn(Ref value) {
-    return &makeRawArray()->push_back(makeRawString(RETURN)).push_back(!!value ? value : makeNull());
+    return &makeRawArray(2)->push_back(makeRawString(RETURN))
+                            .push_back(!!value ? value : makeNull());
   }
 
   static Ref makeIndexing(Ref target, Ref index) {
-    return &makeRawArray()->push_back(makeRawString(SUB))
-                           .push_back(target)
-                           .push_back(index);
+    return &makeRawArray(3)->push_back(makeRawString(SUB))
+                            .push_back(target)
+                            .push_back(index);
   }
 
   static Ref makeIf(Ref condition, Ref ifTrue, Ref ifFalse) {
-    return &makeRawArray()->push_back(makeRawString(IF))
-                           .push_back(condition)
-                           .push_back(ifTrue)
-                           .push_back(!!ifFalse ? ifFalse : makeNull());
+    return &makeRawArray(4)->push_back(makeRawString(IF))
+                            .push_back(condition)
+                            .push_back(ifTrue)
+                            .push_back(!!ifFalse ? ifFalse : makeNull());
   }
 
   static Ref makeConditional(Ref condition, Ref ifTrue, Ref ifFalse) {
-    return &makeRawArray()->push_back(makeRawString(CONDITIONAL))
-                           .push_back(condition)
-                           .push_back(ifTrue)
-                           .push_back(ifFalse);
+    return &makeRawArray(4)->push_back(makeRawString(CONDITIONAL))
+                            .push_back(condition)
+                            .push_back(ifTrue)
+                            .push_back(ifFalse);
   }
 
   static Ref makeDo(Ref body, Ref condition) {
-    return &makeRawArray()->push_back(makeRawString(DO))
-                           .push_back(condition)
-                           .push_back(body);
+    return &makeRawArray(3)->push_back(makeRawString(DO))
+                            .push_back(condition)
+                            .push_back(body);
   }
 
   static Ref makeWhile(Ref condition, Ref body) {
-    return &makeRawArray()->push_back(makeRawString(WHILE))
-                           .push_back(condition)
-                           .push_back(body);
+    return &makeRawArray(3)->push_back(makeRawString(WHILE))
+                            .push_back(condition)
+                            .push_back(body);
   }
 
   static Ref makeBreak(IString label) {
-    return &makeRawArray()->push_back(makeRawString(BREAK)).push_back(!!label ? makeRawString(label) : makeNull());
+    return &makeRawArray(2)->push_back(makeRawString(BREAK))
+                            .push_back(!!label ? makeRawString(label) : makeNull());
   }
 
   static Ref makeContinue(IString label) {
-    return &makeRawArray()->push_back(makeRawString(CONTINUE)).push_back(!!label ? makeRawString(label) : makeNull());
+    return &makeRawArray(2)->push_back(makeRawString(CONTINUE))
+                            .push_back(!!label ? makeRawString(label) : makeNull());
   }
 
   static Ref makeLabel(IString name, Ref body) {
-    return &makeRawArray()->push_back(makeRawString(LABEL))
-                           .push_back(makeRawString(name))
-                           .push_back(body);
+    return &makeRawArray(3)->push_back(makeRawString(LABEL))
+                            .push_back(makeRawString(name))
+                            .push_back(body);
   }
 
   static Ref makeSwitch(Ref input) {
-    return &makeRawArray()->push_back(makeRawString(SWITCH))
-                           .push_back(input)
-                           .push_back(makeRawArray());
+    return &makeRawArray(3)->push_back(makeRawString(SWITCH))
+                            .push_back(input)
+                            .push_back(makeRawArray());
   }
 
   static void appendCaseToSwitch(Ref switch_, Ref arg) {
     assert(switch_[0] == SWITCH);
-    switch_[2]->push_back(&makeRawArray()->push_back(arg).push_back(makeRawArray()));
+    switch_[2]->push_back(&makeRawArray(2)->push_back(arg)
+                                           .push_back(makeRawArray()));
   }
 
   static void appendDefaultToSwitch(Ref switch_) {
     assert(switch_[0] == SWITCH);
-    switch_[2]->push_back(&makeRawArray()->push_back(makeNull()).push_back(makeRawArray()));
+    switch_[2]->push_back(&makeRawArray(2)->push_back(makeNull())
+                                           .push_back(makeRawArray()));
   }
 
   static void appendCodeToSwitch(Ref switch_, Ref code, bool explicitBlock) {
@@ -1464,9 +1479,9 @@ class ValueBuilder {
   }
 
   static Ref makeDot(Ref obj, IString key) {
-    return &makeRawArray()->push_back(makeRawString(DOT))
-                           .push_back(obj)
-                           .push_back(makeRawString(key));
+    return &makeRawArray(3)->push_back(makeRawString(DOT))
+                            .push_back(obj)
+                            .push_back(makeRawString(key));
   }
 
   static Ref makeDot(Ref obj, Ref key) {
@@ -1475,13 +1490,13 @@ class ValueBuilder {
   }
 
   static Ref makeNew(Ref call) {
-    return &makeRawArray()->push_back(makeRawString(NEW))
-                           .push_back(call);
+    return &makeRawArray(2)->push_back(makeRawString(NEW))
+                            .push_back(call);
   }
 
   static Ref makeArray() {
-    return &makeRawArray()->push_back(makeRawString(ARRAY))
-                           .push_back(makeRawArray());
+    return &makeRawArray(2)->push_back(makeRawString(ARRAY))
+                            .push_back(makeRawArray());
   }
 
   static void appendToArray(Ref array, Ref element) {
@@ -1490,14 +1505,14 @@ class ValueBuilder {
   }
 
   static Ref makeObject() {
-    return &makeRawArray()->push_back(makeRawString(OBJECT))
-                           .push_back(makeRawArray());
+    return &makeRawArray(2)->push_back(makeRawString(OBJECT))
+                            .push_back(makeRawArray());
   }
 
   static void appendToObject(Ref array, IString key, Ref value) {
     assert(array[0] == OBJECT);
-    array[1]->push_back(&makeRawArray()->push_back(makeRawString(key))
-                                        .push_back(value));
+    array[1]->push_back(&makeRawArray(2)->push_back(makeRawString(key))
+                                         .push_back(value));
   }
 };
 
diff --git a/tools/shared.py b/tools/shared.py
index 0b557e7761cad..b4b9d3d45c5f4 100644
--- a/tools/shared.py
+++ b/tools/shared.py
@@ -166,6 +166,7 @@ def new(*args):
   return new
 
 WINDOWS = sys.platform.startswith('win')
+OSX = sys.platform == 'darwin'
 
 if WINDOWS:
   logging.StreamHandler.emit = add_coloring_to_emit_windows(logging.StreamHandler.emit)
@@ -209,8 +210,10 @@ def new(*args):
   EM_CONFIG = '~/.emscripten'
 if '\n' in EM_CONFIG:
   CONFIG_FILE = None
+  logging.debug('EM_CONFIG is specified inline without a file')
 else:
   CONFIG_FILE = os.path.expanduser(EM_CONFIG)
+  logging.debug('EM_CONFIG is located in ' + CONFIG_FILE)
   if not os.path.exists(CONFIG_FILE):
     # Note: repr is used to ensure the paths are escaped correctly on Windows.
     # The full string is replaced so that the template stays valid Python.
@@ -221,7 +224,7 @@ def new(*args):
     config_file = config_file.replace('\'{{{ EMSCRIPTEN_ROOT }}}\'', repr(__rootpath__))
     llvm_root = os.path.dirname(find_executable('llvm-dis') or '/usr/bin/llvm-dis')
     config_file = config_file.replace('\'{{{ LLVM_ROOT }}}\'', repr(llvm_root))
-    node = find_executable('node') or find_executable('nodejs') or 'node'
+    node = find_executable('nodejs') or find_executable('node') or 'node'
     config_file = config_file.replace('\'{{{ NODE }}}\'', repr(node))
     if WINDOWS:
       tempdir = os.environ.get('TEMP') or os.environ.get('TMP') or 'c:\\temp'
@@ -259,6 +262,12 @@ def new(*args):
   logging.error('Error in evaluating %s (at %s): %s, text: %s' % (EM_CONFIG, CONFIG_FILE, str(e), config_text))
   sys.exit(1)
 
+# Returns a suggestion where current .emscripten config file might be located (if EM_CONFIG env. var is used 
+# without a file, this hints to "default" location at ~/.emscripten)
+def hint_config_file_location():
+  if CONFIG_FILE: return CONFIG_FILE
+  else: return '~/.emscripten'
+
 def listify(x):
   if type(x) is not list: return [x]
   return x
@@ -500,7 +509,7 @@ def check_sanity(force=False):
     try:
       subprocess.call([JAVA, '-version'], stdout=PIPE, stderr=PIPE)
     except:
-      logging.warning('java does not seem to exist, required for closure compiler, which is optional (define JAVA in ~/.emscripten if you want it)')
+      logging.warning('java does not seem to exist, required for closure compiler, which is optional (define JAVA in ' + hint_config_file_location() + ' if you want it)')
 
     if not os.path.exists(CLOSURE_COMPILER):
      logging.warning('Closure compiler (%s) does not exist, check the paths in %s. -O2 and above will fail' % (CLOSURE_COMPILER, EM_CONFIG))
@@ -563,7 +572,7 @@ def get_clang_native_args():
   global CACHED_CLANG_NATIVE_ARGS
   if CACHED_CLANG_NATIVE_ARGS is not None: return CACHED_CLANG_NATIVE_ARGS
   CACHED_CLANG_NATIVE_ARGS = []
-  if sys.platform == 'darwin':
+  if OSX:
     sdk_path = osx_find_native_sdk_path()
     if sdk_path:
       CACHED_CLANG_NATIVE_ARGS = ['-isysroot', osx_find_native_sdk_path()]
@@ -638,8 +647,8 @@ def __init__(self, environ=os.environ):
     except NameError:
       self.TEMP_DIR = find_temp_directory()
       if self.TEMP_DIR == None:
-        logging.critical('TEMP_DIR not defined in ' + os.path.expanduser('~\\.emscripten') + ", and could not detect a suitable directory! Please configure .emscripten to contain a variable TEMP_DIR='/path/to/temp/dir'.")
-      logging.debug('TEMP_DIR not defined in ~/.emscripten, using ' + self.TEMP_DIR)
+        logging.critical('TEMP_DIR not defined in ' + hint_config_file_location() + ", and could not detect a suitable directory! Please configure .emscripten to contain a variable TEMP_DIR='/path/to/temp/dir'.")
+      logging.debug('TEMP_DIR not defined in ' + hint_config_file_location() + ', using ' + self.TEMP_DIR)
 
     if not os.path.isdir(self.TEMP_DIR):
       logging.critical("The temp directory TEMP_DIR='" + self.TEMP_DIR + "' doesn't seem to exist! Please make sure that the path is correct.")
@@ -651,7 +660,7 @@ def __init__(self, environ=os.environ):
         self.EMSCRIPTEN_TEMP_DIR = self.CANONICAL_TEMP_DIR
         safe_ensure_dirs(self.EMSCRIPTEN_TEMP_DIR)
       except Exception, e:
-        logging.error(str(e) + 'Could not create canonical temp dir. Check definition of TEMP_DIR in ~/.emscripten')
+        logging.error(str(e) + 'Could not create canonical temp dir. Check definition of TEMP_DIR in ' + hint_config_file_location())
 
   def get_temp_files(self):
     return tempfiles.TempFiles(
@@ -693,13 +702,13 @@ def set_logging():
 try:
   PYTHON
 except:
-  logging.debug('PYTHON not defined in ~/.emscripten, using "%s"' % (sys.executable,))
+  logging.debug('PYTHON not defined in ' + hint_config_file_location() + ', using "%s"' % (sys.executable,))
   PYTHON = sys.executable
 
 try:
   JAVA
 except:
-  logging.debug('JAVA not defined in ~/.emscripten, using "java"')
+  logging.debug('JAVA not defined in ' + hint_config_file_location() + ', using "java"')
   JAVA = 'java'
 
 # Additional compiler options
diff --git a/tools/validate_asmjs.py b/tools/validate_asmjs.py
index b0f2dd66f3b7f..ab810a7d3e585 100755
--- a/tools/validate_asmjs.py
+++ b/tools/validate_asmjs.py
@@ -18,7 +18,7 @@
 def validate_asmjs_jsfile(filename, muteOutput):
   cmd = shared.SPIDERMONKEY_ENGINE + ['-c', filename]
   if not shared.SPIDERMONKEY_ENGINE or cmd[0] == 'js-not-found' or len(cmd[0].strip()) == 0:
-    print >> sys.stderr, 'Could not find SpiderMonkey engine! Please set tis location to SPIDERMONKEY_ENGINE in your ~/.emscripten configuration file!'
+    print >> sys.stderr, 'Could not find SpiderMonkey engine! Please set tis location to SPIDERMONKEY_ENGINE in your ' + shared.hint_config_file_location() + ' configuration file!'
     return False
   try:
     process = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE)