diff --git a/CHANGES.rst b/CHANGES.rst
index 822b0fecd..e7585d4ba 100644
--- a/CHANGES.rst
+++ b/CHANGES.rst
@@ -22,6 +22,23 @@ upgrading your version of coverage.py.
 
 .. scriv-start-here
 
+.. _changes_7-5-3:
+
+Version 7.5.3 — 2024-05-28
+--------------------------
+
+- Performance improvements for combining data files, especially when measuring
+  line coverage. A few different quadratic behaviors were eliminated. In one
+  extreme case of combining 700+ data files, the time dropped from more than
+  three hours to seven minutes. Thanks to Kraken Tech for funding the fix.
+
+- Performance improvements for generating HTML reports, with a side benefit of
+  reducing memory use, closing `issue 1791`_. Thanks to Daniel Diniz for
+  helping to diagnose the problem.
+
+.. _issue 1791: https://github.com/nedbat/coveragepy/issues/1791
+
+
 .. _changes_7-5-2:
 
 Version 7.5.2 — 2024-05-24
diff --git a/coverage/control.py b/coverage/control.py
index dbca2013d..614dd8d49 100644
--- a/coverage/control.py
+++ b/coverage/control.py
@@ -998,7 +998,7 @@ def _prepare_data_for_reporting(self) -> None:
         if self.config.paths:
             mapped_data = CoverageData(warn=self._warn, debug=self._debug, no_disk=True)
             if self._data is not None:
-                mapped_data.update(self._data, aliases=self._make_aliases())
+                mapped_data.update(self._data, map_path=self._make_aliases().map)
             self._data = mapped_data
 
     def report(
diff --git a/coverage/data.py b/coverage/data.py
index 9513adfca..1252e4147 100644
--- a/coverage/data.py
+++ b/coverage/data.py
@@ -12,6 +12,7 @@
 
 from __future__ import annotations
 
+import functools
 import glob
 import hashlib
 import os.path
@@ -134,6 +135,11 @@ def combine_parallel_data(
     if strict and not files_to_combine:
         raise NoDataError("No data to combine")
 
+    if aliases is None:
+        map_path = None
+    else:
+        map_path = functools.lru_cache(maxsize=None)(aliases.map)
+
     file_hashes = set()
     combined_any = False
 
@@ -176,7 +182,7 @@ def combine_parallel_data(
                 message(f"Couldn't combine data file {rel_file_name}: {exc}")
             delete_this_one = False
         else:
-            data.update(new_data, aliases=aliases)
+            data.update(new_data, map_path=map_path)
             combined_any = True
             if message:
                 message(f"Combined data file {rel_file_name}")
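Note (not part of the patch): the data.py change above is the memoization trick behind the changelog's combining speedup. Because `update()` now accepts any `Callable[[str], str]`, the call site can wrap the expensive `PathAliases.map` in `functools.lru_cache` once, so each distinct path is re-mapped at most once no matter how many data files mention it. A minimal sketch of the pattern; `expensive_map` is a hypothetical stand-in for `PathAliases.map`:

```python
import functools
from typing import Callable

def expensive_map(path: str) -> str:
    """Hypothetical stand-in for PathAliases.map: imagine each call
    trying many alias regexes against the path."""
    return path.replace("/ci/build", "/home/dev/src")

def combine(paths: list[str], map_path: Callable[[str], str]) -> list[str]:
    # The consumer needs only "a function from str to str"; it no
    # longer knows or cares that a PathAliases object is behind it.
    return [map_path(p) for p in paths]

# Wrap the mapper once, outside the loop over data files: repeated
# paths are then mapped only on their first appearance.
map_path = functools.lru_cache(maxsize=None)(expensive_map)
print(combine(["/ci/build/app.py", "/ci/build/app.py"], map_path))
```

Since measured paths repeat heavily across parallel data files, the cache hit rate is near total, turning repeated alias-matching work into a dictionary lookup.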
""" readline = io.StringIO(text).readline - return list(tokenize.generate_tokens(readline)) + return tokenize.generate_tokens(readline) def source_encoding(source: bytes) -> str: diff --git a/coverage/sqldata.py b/coverage/sqldata.py index f12ccd7a9..c6ba11a94 100644 --- a/coverage/sqldata.py +++ b/coverage/sqldata.py @@ -21,13 +21,12 @@ import zlib from typing import ( - cast, Any, Collection, Mapping, + cast, Any, Callable, Collection, Mapping, Sequence, ) from coverage.debug import NoDebugging, auto_repr from coverage.exceptions import CoverageException, DataError -from coverage.files import PathAliases from coverage.misc import file_be_gone, isolate_module from coverage.numbits import numbits_to_nums, numbits_union, nums_to_numbits from coverage.sqlitedb import SqliteDb @@ -647,12 +646,16 @@ def purge_files(self, filenames: Collection[str]) -> None: continue con.execute_void(sql, (file_id,)) - def update(self, other_data: CoverageData, aliases: PathAliases | None = None) -> None: - """Update this data with data from several other :class:`CoverageData` instances. + def update( + self, + other_data: CoverageData, + map_path: Callable[[str], str] | None = None, + ) -> None: + """Update this data with data from another :class:`CoverageData`. - If `aliases` is provided, it's a `PathAliases` object that is used to - re-map paths to match the local machine's. Note: `aliases` is None - only when called directly from the test suite. + If `map_path` is provided, it's a function that re-map paths to match + the local machine's. Note: `map_path` is None only when called + directly from the test suite. """ if self._debug.should("dataop"): @@ -664,7 +667,7 @@ def update(self, other_data: CoverageData, aliases: PathAliases | None = None) - if self._has_arcs and other_data._has_lines: raise DataError("Can't combine line data with arc data") - aliases = aliases or PathAliases() + map_path = map_path or (lambda p: p) # Force the database we're writing to to exist before we start nesting contexts. self._start_using() @@ -674,7 +677,7 @@ def update(self, other_data: CoverageData, aliases: PathAliases | None = None) - with other_data._connect() as con: # Get files data. with con.execute("select path from file") as cur: - files = {path: aliases.map(path) for (path,) in cur} + files = {path: map_path(path) for (path,) in cur} # Get contexts data. with con.execute("select context from context") as cur: @@ -729,7 +732,7 @@ def update(self, other_data: CoverageData, aliases: PathAliases | None = None) - "inner join file on file.id = tracer.file_id", ) as cur: this_tracers.update({ - aliases.map(path): tracer + map_path(path): tracer for path, tracer in cur }) @@ -767,27 +770,15 @@ def update(self, other_data: CoverageData, aliases: PathAliases | None = None) - # Prepare arc and line rows to be inserted by converting the file # and context strings with integer ids. Then use the efficient # `executemany()` to insert all rows at once. - arc_rows = ( - (file_ids[file], context_ids[context], fromno, tono) - for file, context, fromno, tono in arcs - ) - - # Get line data. 
diff --git a/coverage/sqldata.py b/coverage/sqldata.py
index f12ccd7a9..c6ba11a94 100644
--- a/coverage/sqldata.py
+++ b/coverage/sqldata.py
@@ -21,13 +21,12 @@
 import zlib
 
 from typing import (
-    cast, Any, Collection, Mapping,
+    cast, Any, Callable, Collection, Mapping,
     Sequence,
 )
 
 from coverage.debug import NoDebugging, auto_repr
 from coverage.exceptions import CoverageException, DataError
-from coverage.files import PathAliases
 from coverage.misc import file_be_gone, isolate_module
 from coverage.numbits import numbits_to_nums, numbits_union, nums_to_numbits
 from coverage.sqlitedb import SqliteDb
@@ -647,12 +646,16 @@ def purge_files(self, filenames: Collection[str]) -> None:
                 continue
             con.execute_void(sql, (file_id,))
 
-    def update(self, other_data: CoverageData, aliases: PathAliases | None = None) -> None:
-        """Update this data with data from several other :class:`CoverageData` instances.
+    def update(
+        self,
+        other_data: CoverageData,
+        map_path: Callable[[str], str] | None = None,
+    ) -> None:
+        """Update this data with data from another :class:`CoverageData`.
 
-        If `aliases` is provided, it's a `PathAliases` object that is used to
-        re-map paths to match the local machine's. Note: `aliases` is None
-        only when called directly from the test suite.
+        If `map_path` is provided, it's a function that re-maps paths to match
+        the local machine's. Note: `map_path` is None only when called
+        directly from the test suite.
 
         """
         if self._debug.should("dataop"):
@@ -664,7 +667,7 @@ def update(self, other_data: CoverageData, aliases: PathAliases | None = None) -> None:
         if self._has_arcs and other_data._has_lines:
             raise DataError("Can't combine line data with arc data")
 
-        aliases = aliases or PathAliases()
+        map_path = map_path or (lambda p: p)
 
         # Force the database we're writing to to exist before we start nesting contexts.
         self._start_using()
@@ -674,7 +677,7 @@ def update(self, other_data: CoverageData, aliases: PathAliases | None = None) -> None:
         with other_data._connect() as con:
             # Get files data.
             with con.execute("select path from file") as cur:
-                files = {path: aliases.map(path) for (path,) in cur}
+                files = {path: map_path(path) for (path,) in cur}
 
             # Get contexts data.
             with con.execute("select context from context") as cur:
@@ -729,7 +732,7 @@ def update(self, other_data: CoverageData, aliases: PathAliases | None = None) -> None:
                 "inner join file on file.id = tracer.file_id",
             ) as cur:
                 this_tracers.update({
-                    aliases.map(path): tracer
+                    map_path(path): tracer
                    for path, tracer in cur
                 })
@@ -767,27 +770,15 @@ def update(self, other_data: CoverageData, aliases: PathAliases | None = None) -> None:
             # Prepare arc and line rows to be inserted by converting the file
             # and context strings with integer ids. Then use the efficient
             # `executemany()` to insert all rows at once.
-            arc_rows = (
-                (file_ids[file], context_ids[context], fromno, tono)
-                for file, context, fromno, tono in arcs
-            )
-
-            # Get line data.
-            with con.execute(
-                "select file.path, context.context, line_bits.numbits " +
-                "from line_bits " +
-                "inner join file on file.id = line_bits.file_id " +
-                "inner join context on context.id = line_bits.context_id",
-            ) as cur:
-                for path, context, numbits in cur:
-                    key = (aliases.map(path), context)
-                    if key in lines:
-                        numbits = numbits_union(lines[key], numbits)
-                    lines[key] = numbits
 
             if arcs:
                 self._choose_lines_or_arcs(arcs=True)
 
+                arc_rows = (
+                    (file_ids[file], context_ids[context], fromno, tono)
+                    for file, context, fromno, tono in arcs
+                )
+
                 # Write the combined data.
                 con.executemany_void(
                     "insert or ignore into arc " +
@@ -797,15 +788,25 @@ def update(self, other_data: CoverageData, aliases: PathAliases | None = None) -> None:
 
             if lines:
                 self._choose_lines_or_arcs(lines=True)
-                con.execute_void("delete from line_bits")
+
+                for (file, context), numbits in lines.items():
+                    with con.execute(
+                        "select numbits from line_bits where file_id = ? and context_id = ?",
+                        (file_ids[file], context_ids[context]),
+                    ) as cur:
+                        existing = list(cur)
+                        if existing:
+                            lines[(file, context)] = numbits_union(numbits, existing[0][0])
+
                 con.executemany_void(
-                    "insert into line_bits " +
+                    "insert or replace into line_bits " +
                     "(file_id, context_id, numbits) values (?, ?, ?)",
                     [
                         (file_ids[file], context_ids[context], numbits)
                         for (file, context), numbits in lines.items()
                     ],
                 )
+
             con.executemany_void(
                 "insert or ignore into tracer (file_id, tracer) values (?, ?)",
                 ((file_ids[filename], tracer) for filename, tracer in tracer_map.items()),
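Note (not part of the patch): the line_bits rewrite replaces a wipe-and-reload (read every existing row, union in Python, `delete from line_bits`, reinsert everything) with a targeted merge: look up only the (file, context) keys arriving in the new data, union their numbits with any existing row, and write back through `insert or replace`, which acts as an upsert against the table's unique key. Repeated once per combined data file, the old approach rewrote the whole table each time, one of the quadratic behaviors the changelog mentions. A self-contained sqlite3 sketch of the new pattern; the toy `union` stands in for coverage's `numbits_union`, and a simplified model of numbits itself follows after this note:

```python
import sqlite3

def union(a: bytes, b: bytes) -> bytes:
    # Toy stand-in for coverage.numbits.numbits_union: byte-wise OR,
    # padded to the longer length.
    if len(a) < len(b):
        a, b = b, a
    return bytes(x | y for x, y in zip(a, b.ljust(len(a), b"\x00")))

con = sqlite3.connect(":memory:")
con.execute(
    "create table line_bits (file_id int, context_id int, numbits blob, "
    "unique (file_id, context_id))"
)
con.execute("insert into line_bits values (1, 1, ?)", (b"\x05",))  # bitmap 0b101

# Rows arriving from the data file being combined in.
incoming = {(1, 1): b"\x06"}  # bitmap 0b110

# Union only the keys we are actually updating...
for key, numbits in incoming.items():
    row = con.execute(
        "select numbits from line_bits where file_id = ? and context_id = ?",
        key,
    ).fetchone()
    if row:
        incoming[key] = union(numbits, row[0])

# ...then let the unique key turn each insert into an upsert.
con.executemany(
    "insert or replace into line_bits (file_id, context_id, numbits) "
    "values (?, ?, ?)",
    [(f, c, n) for (f, c), n in incoming.items()],
)
print(con.execute("select numbits from line_bits").fetchone()[0])  # b'\x07' = 0b111
```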

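Note (not part of the patch): for context on `numbits_union`, coverage.py stores each file's executed line numbers as a compact bitmap blob, a "numbits", so unioning two runs is a byte-wise OR rather than a Python set operation. A simplified model of the encoding; the real implementations live in coverage/numbits.py under the same names:

```python
def nums_to_numbits(nums: set[int]) -> bytes:
    # Bit n of the blob is set when line n was executed.
    b = bytearray(max(nums) // 8 + 1 if nums else 0)
    for n in nums:
        b[n // 8] |= 1 << (n % 8)
    return bytes(b)

def numbits_union(a: bytes, b: bytes) -> bytes:
    width = max(len(a), len(b))
    return bytes(x | y for x, y in
                 zip(a.ljust(width, b"\x00"), b.ljust(width, b"\x00")))

def numbits_to_nums(numbits: bytes) -> list[int]:
    return [byte_i * 8 + bit_i
            for byte_i, byte in enumerate(numbits)
            for bit_i in range(8)
            if byte & (1 << bit_i)]

# Merging two coverage runs of the same file:
assert numbits_to_nums(
    numbits_union(nums_to_numbits({1, 3}), nums_to_numbits({2, 3}))
) == [1, 2, 3]
```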
diff --git a/coverage/version.py b/coverage/version.py
index 0c47cedea..b41c0ef71 100644
--- a/coverage/version.py
+++ b/coverage/version.py
@@ -8,7 +8,7 @@
 
 # version_info: same semantics as sys.version_info.
 # _dev: the .devN suffix if any.
-version_info = (7, 5, 2, "final", 0)
+version_info = (7, 5, 3, "final", 0)
 _dev = 0
 
diff --git a/doc/conf.py b/doc/conf.py
index e746730cc..acd3b2713 100644
--- a/doc/conf.py
+++ b/doc/conf.py
@@ -67,11 +67,11 @@
 # @@@ editable
 copyright = "2009–2024, Ned Batchelder"  # pylint: disable=redefined-builtin
 # The short X.Y.Z version.
-version = "7.5.2"
+version = "7.5.3"
 # The full version, including alpha/beta/rc tags.
-release = "7.5.2"
+release = "7.5.3"
 # The date of release, in "monthname day, year" format.
-release_date = "May 24, 2024"
+release_date = "May 28, 2024"
 # @@@ end
 
 rst_epilog = f"""
diff --git a/doc/sample_html/class_index.html b/doc/sample_html/class_index.html
index ad1948c35..390e1a839 100644
--- a/doc/sample_html/class_index.html
+++ b/doc/sample_html/class_index.html
@@ -56,8 +56,8 @@
             Classes
-            coverage.py v7.5.2,
-            created at 2024-05-24 16:52 -0400
+            coverage.py v7.5.3,
+            created at 2024-05-28 09:37 -0400
@@ -537,8 +537,8 @@
-            coverage.py v7.5.2,
-            created at 2024-05-24 16:52 -0400
+            coverage.py v7.5.3,
+            created at 2024-05-28 09:37 -0400
diff --git a/doc/sample_html/z_7b071bdc2a35fa80_makefiles_py.html b/doc/sample_html/z_7b071bdc2a35fa80_makefiles_py.html
index 1b25d0242..cf45038ef 100644
--- a/doc/sample_html/z_7b071bdc2a35fa80_makefiles_py.html
+++ b/doc/sample_html/z_7b071bdc2a35fa80_makefiles_py.html
@@ -66,8 +66,8 @@
             ^ index     » next
-            coverage.py v7.5.2,
-            created at 2024-05-24 16:52 -0400
+            coverage.py v7.5.3,
+            created at 2024-05-28 09:37 -0400