Commit cce9e81

Support Python 3 in the benchmark suite

Parent: a9fc265
9 files changed, +147 −81 lines

Diff for: benchmark/scripts/Benchmark_DTrace.in (+4 −1)

@@ -103,6 +103,7 @@ class DTraceBenchmarkDriver(perf_test_driver.BenchmarkDriver):
             stdout=subprocess.PIPE,
             stderr=open("/dev/null", "w"),
             env=e,
+            universal_newlines=True,
         )
         results = [x for x in p.communicate()[0].split("\n") if len(x) > 0]
         return [
@@ -136,7 +137,9 @@ class DTraceBenchmarkDriver(perf_test_driver.BenchmarkDriver):
         results.append(result_3)
         results.append(single_iter)
 
-        return DTraceResult(test_name, int(not foundInstability), results)
+        return DTraceResult(
+            test_name, int(not foundInstability), results, self.csv_output
+        )
 
 
 SWIFT_BIN_DIR = os.path.dirname(os.path.abspath(__file__))
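The `universal_newlines=True` additions throughout this commit are what keep the string handling working on Python 3, where subprocess pipes return bytes by default. A minimal sketch of the difference, not part of the commit; `echo` is only a stand-in command:

    import subprocess

    # Python 2: check_output returns str either way.
    # Python 3: without universal_newlines the result is bytes, and
    # b"a\nb".split("\n") raises TypeError; with it, the result is str.
    out = subprocess.check_output(["echo", "a"], universal_newlines=True)
    lines = [x for x in out.split("\n") if len(x) > 0]
    print(lines)  # ['a'] on both Python 2 and 3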

Diff for: benchmark/scripts/Benchmark_Driver (+39 −28)

@@ -26,6 +26,7 @@ class `BenchmarkDoctor` analyzes performance tests, implements `check` COMMAND.
 """
 
 import argparse
+import functools
 import glob
 import logging
 import math
@@ -64,7 +65,9 @@ class BenchmarkDriver(object):
         os.environ["SWIFT_DETERMINISTIC_HASHING"] = "1"
 
     def _invoke(self, cmd):
-        return self._subprocess.check_output(cmd, stderr=self._subprocess.STDOUT)
+        return self._subprocess.check_output(
+            cmd, stderr=self._subprocess.STDOUT, universal_newlines=True
+        )
 
     @property
     def test_harness(self):
@@ -165,7 +168,7 @@ class BenchmarkDriver(object):
         )
         output = self._invoke(cmd)
         results = self.parser.results_from_string(output)
-        return results.items()[0][1] if test else results
+        return list(results.items())[0][1] if test else results
 
     def _cmd_run(
         self,
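The extra `list(...)` is needed because `dict.items()` returns a plain list on Python 2 but a non-indexable view on Python 3. A tiny illustration with a made-up results dict:

    results = {"b1": object()}  # stand-in for a PerformanceTestResult

    # Python 2: results.items()[0] works because items() is a list.
    # Python 3: dict_items has no __getitem__, so materialize it first.
    first_result = list(results.items())[0][1]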
@@ -207,7 +210,7 @@ class BenchmarkDriver(object):
             a.merge(b)
             return a
 
-        return reduce(
+        return functools.reduce(
            merge_results,
             [
                 self.run(test, measure_memory=True, num_iters=1, quantile=20)
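`reduce` is no longer a builtin on Python 3; importing it from `functools` works on both interpreters. A simpler fold of the same shape, with made-up values (the driver itself folds result objects through `merge_results`):

    import functools

    merged = functools.reduce(lambda a, b: a + b, [[1], [2], [3]], [])
    assert merged == [1, 2, 3]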
@@ -249,19 +252,21 @@ class BenchmarkDriver(object):
         print(format(values))
 
         def result_values(r):
-            return map(
-                str,
-                [
-                    r.test_num,
-                    r.name,
-                    r.num_samples,
-                    r.min,
-                    r.samples.q1,
-                    r.median,
-                    r.samples.q3,
-                    r.max,
-                    r.max_rss,
-                ],
+            return list(
+                map(
+                    str,
+                    [
+                        r.test_num,
+                        r.name,
+                        r.num_samples,
+                        r.min,
+                        r.samples.q1,
+                        r.median,
+                        r.samples.q3,
+                        r.max,
+                        r.max_rss,
+                    ],
+                )
             )
 
         header = [
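Wrapping the `map(...)` in `list(...)` matters because `map` became lazy in Python 3, so the row of values would otherwise be a one-shot iterator, presumably unusable for repeated formatting or indexing. Roughly:

    values = map(str, [1, 2.5, None])

    # Python 2: a list; Python 3: an iterator that is empty after one pass
    # and cannot be indexed or measured with len().
    values = list(values)
    assert values == ["1", "2.5", "None"] and len(values) == 3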
@@ -370,7 +375,12 @@ class MarkdownReportHandler(logging.StreamHandler):
         msg = self.format(record)
         stream = self.stream
         try:
-            if isinstance(msg, unicode) and getattr(stream, "encoding", None):
+            # In Python 2 Unicode strings have a special type
+            unicode_type = unicode
+        except NameError:
+            unicode_type = str
+        try:
+            if isinstance(msg, unicode_type) and getattr(stream, "encoding", None):
                 stream.write(msg.encode(stream.encoding))
             else:
                 stream.write(msg)
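The `unicode` builtin only exists on Python 2, so the handler now resolves the text type once and falls back to `str`. The same pattern in isolation; `text_type` and `needs_reencoding` are illustrative names, the commit calls the resolved type `unicode_type`:

    try:
        text_type = unicode  # only defined on Python 2
    except NameError:
        text_type = str      # Python 3: every str is already Unicode


    def needs_reencoding(msg, encoding):
        # Mirrors the handler's isinstance check above.
        return isinstance(msg, text_type) and encoding is not None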
@@ -487,16 +497,14 @@ class BenchmarkDoctor(object):
         name = measurements["name"]
         setup, ratio = BenchmarkDoctor._setup_overhead(measurements)
         setup = 0 if ratio < 0.05 else setup
-        runtime = min(
-            [
-                (result.samples.min - correction)
-                for i_series in [
-                    BenchmarkDoctor._select(measurements, num_iters=i)
-                    for correction in [(setup / i) for i in [1, 2]]
-                ]
-                for result in i_series
-            ]
-        )
+
+        runtimes = []
+        for i in range(1, 3):
+            correction = setup / i
+            i_series = BenchmarkDoctor._select(measurements, num_iters=i)
+            for result in i_series:
+                runtimes.append(result.samples.min - correction)
+        runtime = min(runtimes)
 
         threshold = 1000
         if threshold < runtime:
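Unrolling the nested comprehension sidesteps a Python 2/3 scoping difference: Python 2 leaked the inner `for i in [1, 2]` variable into the surrounding comprehension, which the old `num_iters=i` argument appears to have relied on, while Python 3 gives each comprehension its own scope and would likely raise NameError there. A self-contained version of the new loop with invented numbers:

    setup = 100
    sample_minima = {1: [1200, 1100], 2: [700, 650]}  # hypothetical per-num_iters minima

    runtimes = []
    for i in range(1, 3):
        correction = setup / i          # setup cost amortized over i iterations
        for sample_min in sample_minima[i]:
            runtimes.append(sample_min - correction)
    runtime = min(runtimes)             # 600 with the numbers above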
@@ -572,7 +580,9 @@ class BenchmarkDoctor(object):
 
     @staticmethod
     def _reasonable_setup_time(measurements):
-        setup = min([result.setup for result in BenchmarkDoctor._select(measurements)])
+        setup = min(
+            [result.setup or 0 for result in BenchmarkDoctor._select(measurements)]
+        )
         if 200000 < setup:  # 200 ms
             BenchmarkDoctor.log_runtime.error(
                 "'%s' setup took at least %d μs.", measurements["name"], setup
@@ -857,6 +867,7 @@ def parse_args(args):
         help="See COMMAND -h for additional arguments",
         metavar="COMMAND",
     )
+    subparsers.required = True
 
     shared_benchmarks_parser = argparse.ArgumentParser(add_help=False)
     benchmarks_group = shared_benchmarks_parser.add_mutually_exclusive_group()
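`subparsers.required = True` restores the Python 2 behaviour where running the driver without a COMMAND is an error; Python 3's argparse made subcommands optional by default. A standalone sketch, where the `dest=` and the `run` subcommand exist only to keep the snippet self-contained:

    import argparse

    parser = argparse.ArgumentParser()
    subparsers = parser.add_subparsers(dest="command", metavar="COMMAND")
    subparsers.required = True          # opt back in to "COMMAND is mandatory"
    subparsers.add_parser("run")

    try:
        parser.parse_args([])           # no COMMAND given
    except SystemExit:
        print("argparse rejected the empty command line, as it did on Python 2")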

Diff for: benchmark/scripts/Benchmark_QuickCheck.in (+2 −1)

@@ -63,6 +63,7 @@ class QuickCheckBenchmarkDriver(perf_test_driver.BenchmarkDriver):
             ],
             stdout=subprocess.PIPE,
             stderr=subprocess.PIPE,
+            universal_newlines=True,
         )
         error_out = p.communicate()[1].split("\n")
         result = p.returncode
@@ -76,7 +77,7 @@ class QuickCheckBenchmarkDriver(perf_test_driver.BenchmarkDriver):
         try:
             args = [data, num_iters]
             perf_test_driver.run_with_timeout(self.run_test_inner, args)
-        except Exception, e:
+        except Exception as e:
             sys.stderr.write(
                 "Child Process Failed! (%s,%s). Error: %s\n"
                 % (data["path"], data["test_name"], e)

Diff for: benchmark/scripts/Benchmark_RuntimeLeaksRunner.in (+2 −1)

@@ -89,6 +89,7 @@ class LeaksRunnerBenchmarkDriver(perf_test_driver.BenchmarkDriver):
             ],
             stdout=subprocess.PIPE,
             stderr=subprocess.PIPE,
+            universal_newlines=True,
         )
         error_out = p.communicate()[1].split("\n")
         result = p.returncode
@@ -102,7 +103,7 @@ class LeaksRunnerBenchmarkDriver(perf_test_driver.BenchmarkDriver):
         try:
             args = [data, num_iters]
             result = perf_test_driver.run_with_timeout(self.run_test_inner, args)
-        except Exception, e:
+        except Exception as e:
             sys.stderr.write(
                 "Child Process Failed! (%s,%s). Error: %s\n"
                 % (data["path"], data["test_name"], e)

Diff for: benchmark/scripts/compare_perf_tests.py (+15 −8)

@@ -30,6 +30,7 @@ class `ReportFormatter` creates the test comparison report in specified format.
 from __future__ import print_function
 
 import argparse
+import functools
 import re
 import sys
 from bisect import bisect, bisect_left, bisect_right
@@ -142,7 +143,7 @@ def num_samples(self):
     @property
     def all_samples(self):
         """List of all samples in ascending order."""
-        return sorted(self.samples + self.outliers, key=lambda s: s.i)
+        return sorted(self.samples + self.outliers, key=lambda s: s.i or -1)
 
     @property
     def min(self):
@@ -189,13 +190,16 @@ def sd(self):
         return 0 if self.count < 2 else sqrt(self.S_runtime / (self.count - 1))
 
     @staticmethod
-    def running_mean_variance((k, M_, S_), x):
+    def running_mean_variance(stats, x):
         """Compute running variance, B. P. Welford's method.
 
         See Knuth TAOCP vol 2, 3rd edition, page 232, or
         https://www.johndcook.com/blog/standard_deviation/
         M is mean, Standard Deviation is defined as sqrt(S/k-1)
         """
+
+        (k, M_, S_) = stats
+
         k = float(k + 1)
         M = M_ + (x - M_) / k
         S = S_ + (x - M_) * (x - M)
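Tuple parameters in a function signature, as in `def f((a, b), c)`, were removed by PEP 3113, so the Welford update now takes a plain `stats` argument and unpacks it in the body. The same update as a stand-alone sketch; the trailing `return` is assumed here, since the hunk ends before it:

    from math import sqrt


    def running_mean_variance(stats, x):
        """One step of B. P. Welford's running mean/variance update."""
        (k, M_, S_) = stats
        k = float(k + 1)
        M = M_ + (x - M_) / k
        S = S_ + (x - M_) * (x - M)
        return (k, M, S)


    k, M, S = (0, 0.0, 0.0)
    for sample in [10.0, 12.0, 11.0]:
        k, M, S = running_mean_variance((k, M, S), sample)
    sd = sqrt(S / (k - 1))   # sample standard deviation, 1.0 for these samples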
@@ -247,7 +251,7 @@ def __init__(self, csv_row, quantiles=False, memory=False, delta=False, meta=Fal
         runtimes = csv_row[3:mem_index] if memory or meta else csv_row[3:]
         if delta:
             runtimes = [int(x) if x else 0 for x in runtimes]
-            runtimes = reduce(
+            runtimes = functools.reduce(
                 lambda l, x: l.append(l[-1] + x) or l if l else [x],  # runnin
                 runtimes,
                 None,
@@ -315,7 +319,8 @@ def merge(self, r):
         """
         # Statistics
         if self.samples and r.samples:
-            map(self.samples.add, r.samples.samples)
+            for sample in r.samples.samples:
+                self.samples.add(sample)
             sams = self.samples
             self.num_samples = sams.num_samples
             self.min, self.max, self.median, self.mean, self.sd = (
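Replacing `map(self.samples.add, ...)` with an explicit loop is not just cosmetic: on Python 3 `map` is lazy, so a `map` called only for its side effects never runs. A small demonstration with a hypothetical `Samples`-like container:

    class Bucket(object):
        def __init__(self):
            self.items = []

        def add(self, x):
            self.items.append(x)


    b = Bucket()
    map(b.add, [1, 2, 3])        # Python 2: adds all three; Python 3: does nothing
    for sample in [1, 2, 3]:     # the explicit loop behaves identically on both
        b.add(sample)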
@@ -490,7 +495,7 @@ def add_or_merge(names, r):
                 names[r.name].merge(r)
             return names
 
-        return reduce(add_or_merge, tests, dict())
+        return functools.reduce(add_or_merge, tests, dict())
 
     @staticmethod
     def results_from_string(log_contents):
@@ -544,10 +549,12 @@ def __init__(self, old_results, new_results, delta_threshold):
         def compare(name):
             return ResultComparison(old_results[name], new_results[name])
 
-        comparisons = map(compare, comparable_tests)
+        comparisons = list(map(compare, comparable_tests))
 
         def partition(l, p):
-            return reduce(lambda x, y: x[not p(y)].append(y) or x, l, ([], []))
+            return functools.reduce(
+                lambda x, y: x[not p(y)].append(y) or x, l, ([], [])
+            )
 
         decreased, not_decreased = partition(
             comparisons, lambda c: c.ratio < (1 - delta_threshold)
@@ -668,7 +675,7 @@ def _column_widths(self):
         def max_widths(maximum, widths):
             return map(max, zip(maximum, widths))
 
-        return reduce(max_widths, widths, [0] * 5)
+        return list(functools.reduce(max_widths, widths, [0] * 5))
 
     def _formatted_text(
         self, label_formatter, COLUMN_SEPARATOR, DELIMITER_ROW, SEPARATOR, SECTION
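The final `list(...)` around this fold is needed because `max_widths` returns a `map` object on Python 3, so the reduced value would otherwise be a lazy iterator rather than a reusable list of column widths. Roughly, with invented widths:

    import functools

    widths = [[3, 10, 6, 6, 6], [4, 12, 5, 5, 5]]


    def max_widths(maximum, current):
        return map(max, zip(maximum, current))  # lazy on Python 3


    column_widths = list(functools.reduce(max_widths, widths, [0] * 5))
    assert column_widths == [4, 12, 6, 6, 6]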

Diff for: benchmark/scripts/perf_test_driver/perf_test_driver.py (+2 −1)

@@ -111,7 +111,8 @@ def process_input(self, data):
     def run_for_opt_level(self, binary, opt_level, test_filter):
         print("testing driver at path: %s" % binary)
         names = []
-        for l in subprocess.check_output([binary, "--list"]).split("\n")[1:]:
+        output = subprocess.check_output([binary, "--list"], universal_newlines=True)
+        for l in output.split("\n")[1:]:
             m = BENCHMARK_OUTPUT_RE.match(l)
             if m is None:
                 continue

Diff for: benchmark/scripts/test_Benchmark_Driver.py (+48 −9)

@@ -15,15 +15,33 @@
 
 import logging
 import os
+import sys
 import time
 import unittest
-from StringIO import StringIO
-from imp import load_source
+
+try:
+    # for Python 2
+    from StringIO import StringIO
+except ImportError:
+    # for Python 3
+    from io import StringIO
 
 from compare_perf_tests import PerformanceTestResult
 
 from test_utils import Mock, MockLoggingHandler, Stub, captured_output
 
+
+# imp.load_source is deprecated in Python 3.4
+if sys.version_info < (3, 4):
+    from imp import load_source
+else:
+
+    def load_source(name, path):
+        from importlib.machinery import SourceFileLoader
+
+        return SourceFileLoader(name, path).load_module()
+
+
 # import Benchmark_Driver  # doesn't work because it misses '.py' extension
 Benchmark_Driver = load_source(
     "Benchmark_Driver",
@@ -46,7 +64,17 @@ def assert_contains(self, texts, output):
     def test_requires_command_argument(self):
         with captured_output() as (_, err):
             self.assertRaises(SystemExit, parse_args, [])
-        self.assert_contains(["usage:", "COMMAND", "too few arguments"], err.getvalue())
+
+        if sys.version_info < (3, 3):
+            self.assert_contains(
+                ["usage:", "COMMAND", "too few arguments"], err.getvalue()
+            )
+        else:
+            # The error message has changed in Python 3.3
+            self.assert_contains(
+                ["usage:", "COMMAND", "the following arguments are required"],
+                err.getvalue(),
+            )
 
     def test_command_help_lists_commands(self):
         with captured_output() as (out, _):
@@ -151,7 +179,14 @@ class SubprocessMock(Mock):
     def __init__(self, responses=None):
         super(SubprocessMock, self).__init__(responses)
 
-        def _check_output(args, stdin=None, stdout=None, stderr=None, shell=False):
+        def _check_output(
+            args,
+            stdin=None,
+            stdout=None,
+            stderr=None,
+            shell=False,
+            universal_newlines=False,
+        ):
             return self.record_and_respond(args, stdin, stdout, stderr, shell)
 
         self.check_output = _check_output
@@ -190,8 +225,8 @@ def test_gets_list_of_precommit_benchmarks(self):
         self.subprocess_mock.assert_called_all_expected()
         self.assertEqual(driver.tests, ["Benchmark1", "Benchmark2"])
         self.assertEqual(driver.all_tests, ["Benchmark1", "Benchmark2"])
-        self.assertEquals(driver.test_number["Benchmark1"], "1")
-        self.assertEquals(driver.test_number["Benchmark2"], "2")
+        self.assertEqual(driver.test_number["Benchmark1"], "1")
+        self.assertEqual(driver.test_number["Benchmark2"], "2")
 
         list_all_tests = (
             "/benchmarks/Benchmark_O --list --delim=\t --skip-tags=".split(" "),
@@ -330,10 +365,10 @@ def test_parse_results_from_running_benchmarks(self):
         """
         r = self.driver.run("b")
         self.assertTrue(self.parser_stub.results_from_string_called)
-        self.assertEquals(r.name, "b1")  # non-matching name, just 1st result
+        self.assertEqual(r.name, "b1")  # non-matching name, just 1st result
         r = self.driver.run()
         self.assertTrue(isinstance(r, dict))
-        self.assertEquals(r["b1"].name, "b1")
+        self.assertEqual(r["b1"].name, "b1")
 
     def test_measure_memory(self):
         self.driver.run("b", measure_memory=True)
@@ -412,7 +447,11 @@ def test_log_results(self):
 
         def assert_log_written(out, log_file, content):
             self.assertEqual(out.getvalue(), "Logging results to: " + log_file + "\n")
-            with open(log_file, "rU") as f:
+            if sys.version_info < (3, 0):
+                openmode = "rU"
+            else:
+                openmode = "r"  # 'U' mode is deprecated in Python 3
+            with open(log_file, openmode) as f:
                 text = f.read()
             self.assertEqual(text, "formatted output")
 