"""Create code credit RST file.
Run ./tools/dev/update_credit_json.py first to get the latest PR JSON files.
"""
# Authors: The MNE-Python contributors.
# License: BSD-3-Clause
# Copyright the MNE-Python contributors.
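# Typical regeneration workflow (assumed; run from the repository root):
#
#   python tools/dev/update_credit_json.py   # refresh doc/sphinxext/prs/*.json
#   python doc/sphinxext/credit_tools.py     # rewrite doc/code_credit.inc
#
# The ``app`` argument of generate_credit_rst() suggests it is also wired up as a
# Sphinx hook during the documentation build, but the command-line entry point at
# the bottom of this file can be used directly.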
import glob
import json
import pathlib
import re
from collections import defaultdict
from pathlib import Path
import numpy as np
import sphinx.util.logging
import mne
from mne.utils import logger, verbose
sphinx_logger = sphinx.util.logging.getLogger("mne")
repo_root = Path(__file__).parents[2]
doc_root = repo_root / "doc"
data_dir = doc_root / "sphinxext"
# TODO: For contributor names there are three sources of potential truth:
#
# 1. names.inc
# 2. GitHub profile names (that we pull dynamically here)
# 3. commit history / .mailmap.
#
# All three names can mismatch. Currently we defer to names.inc, since those entries
# are assumed to have been chosen the most consciously/intentionally by contributors.
# However, people can also change their preferred name over time, so deferring to
# GitHub profile info (when complete!) would probably be the better choice.
# Allowed singletons
single_names = "btkcodedev buildqa sviter Akshay".split()
# Surnames where we have more than one distinct contributor:
name_counts = dict(
Bailey=2,
Das=2,
Drew=2,
Li=2,
Peterson=2,
Wong=2,
Zhang=2,
)
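# Illustration (hypothetical): with Zhang=2 above, up to two distinct contributors
# whose surname is "Zhang" are accepted; a third would be flagged as a possible
# duplicate by the last-name check in generate_credit_rst() below.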
# Exceptions, e.g., abbreviations in first/last name or all-caps
exceptions = [
"T. Wang",
"Ziyi ZENG",
]
# Manual renames
manual_renames = {
"alexandra": "Alexandra Corneyllie", # 7600
"alexandra.corneyllie": "Alexandra Corneyllie", # 7600
"akshay0724": "Akshay", # 4046, TODO: Check singleton
"AnneSo": "Anne-Sophie Dubarry", # 4910
"Basile": "Basile Pinsard", # 1791
"ChristinaZhao": "Christina Zhao", # 9075
"Drew, J.": "Jordan Drew", # 10861
"enzo": "Enzo Altamiranda", # 11351
"Frostime": "Yiping Zuo", # 11773
"Gennadiy": "Gennadiy Belonosov", # 11720
"Genuster": "Gennadiy Belonosov", # 12936
"GreasyCat": "Rongfei Jin", # 13113
"Hamid": "Hamid Maymandi", # 10849
"jwelzel": "Julius Welzel", # 11118
"Martin": "Martin Billinger", # 8099, TODO: Check
"Mats": "Mats van Es", # 11068
"Michael": "Michael Krause", # 3304
"Naveen": "Naveen Srinivasan", # 10787
"NoahMarkowitz": "Noah Markowitz", # 12669
"PAB": "Pierre-Antoine Bannier", # 9430
"Rob Luke": "Robert Luke",
"Sena": "Sena Er", # 11029
"TzionaN": "Tziona NessAiver", # 10953
"Valerii": "Valerii Chirkov", # 9043
"Zhenya": "Evgenii Kalenkovich", # 6310, TODO: Check
}


def _good_name(name):
if name is None:
return False
assert isinstance(name, str), type(name)
if not name.strip():
return False
if " " not in name and name not in single_names: # at least two parts
return False
    if (name not in exceptions and "." in name.split()[0]) or "." in name.split()[-1]:
return False
if " " in name and name not in exceptions:
first = name.split()[0]
last = name.split()[-1]
if first == first.upper() or last == last.upper(): # e.g., KING instead of King
return False
return True
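# Rough behavior of _good_name on hypothetical inputs (not taken from the data):
#   _good_name("Jane Doe")  -> True   (two parts, mixed case, no abbreviation)
#   _good_name("jdoe")      -> False  (single token not listed in single_names)
#   _good_name("J. Doe")    -> False  (abbreviated first name, not in exceptions)
#   _good_name("JANE Doe")  -> False  (all-caps part, not in exceptions)
#   _good_name("T. Wang")   -> True   (explicitly allowed via exceptions)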


@verbose
def generate_credit_rst(app=None, *, verbose=False):
"""Get the credit RST."""
sphinx_logger.info("Creating code credit RST inclusion file")
ignores = [
int(ignore.split("#", maxsplit=1)[1].strip().split()[0][:-1])
for ignore in (repo_root / ".git-blame-ignore-revs")
.read_text("utf-8")
.splitlines()
if not ignore.strip().startswith("#") and ignore.strip()
]
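    # The parsing above assumes each non-comment line of .git-blame-ignore-revs
    # looks roughly like "<sha>  # 12345, reason ..." (hypothetical line): the PR
    # number directly follows the "#" and its trailing comma is stripped by the
    # "[:-1]" slice before int().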
ignores = {str(ig): [] for ig in ignores}
# Use mailmap to help translate emails to names
mailmap = dict()
# mapping from email to name
name_map: dict[str, str] = dict()
for line in (repo_root / ".mailmap").read_text("utf-8").splitlines():
name = re.match("^([^<]+) <([^<>]+)>", line.strip()).group(1)
assert _good_name(name), repr(name)
emails = list(re.findall("<([^<>]+)>", line.strip()))
assert len(emails) > 0
new = emails[0]
if new in name_map:
assert name_map[new] == name
else:
name_map[new] = name
if len(emails) == 1:
continue
for old in emails[1:]:
if old in mailmap:
assert new == mailmap[old] # can be different names
else:
mailmap[old] = new
if old in name_map:
assert name_map[old] == name
else:
name_map[old] = name
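    # Example of the mapping built above for a hypothetical .mailmap line
    # "Jane Doe <jane@new.org> <jane@old.org>":
    #   name_map["jane@new.org"] == name_map["jane@old.org"] == "Jane Doe"
    #   mailmap["jane@old.org"] == "jane@new.org"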
unknown_emails: set[str] = set()
# dict with (name, commit) keys, values are int change counts
# ("commits" is really "PRs" for Python mode)
    commits: dict[tuple[str, str], int] = defaultdict(lambda: 0)
# dict with filename keys, values are dicts with name keys and +/- ndarrays
stats: dict[str, dict[str, np.ndarray]] = defaultdict(
lambda: defaultdict(
lambda: np.zeros(2, int),
),
)
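    # Shapes of the two accumulators above, shown with made-up keys:
    #   commits[("Jane Doe", "12345")] -> lines touched by that author in that PR
    #   stats["mne/io/some_file.py"]["Jane Doe"] -> np.array([added, deleted])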
bad_commits = set()
expected_bad_names = dict()
for fname in sorted(glob.glob(str(data_dir / "prs" / "*.json"))):
commit = Path(fname).stem # PR number is in the filename
data = json.loads(Path(fname).read_text("utf-8"))
del fname
assert data != {}
authors = data["authors"]
for author in authors:
if (
author["e"] is not None
and author["e"] not in name_map
and _good_name(author["n"])
):
name_map[author["e"]] = author["n"]
for file, counts in data["changes"].items():
if commit in ignores:
ignores[commit].append([file, commit])
continue
p, m = counts["a"], counts["d"]
used_authors = set()
for author in authors:
if author["e"] is not None:
if author["e"] not in name_map:
unknown_emails.add(
f"{author['e'].ljust(29)} "
"https://github.com/mne-tools/mne-python/pull/"
f"{commit}/files"
)
continue
name = name_map[author["e"]]
else:
name = author["n"]
if name in manual_renames:
assert _good_name(manual_renames[name]), (
f"Bad manual rename: {name}"
)
name = manual_renames[name]
if " " in name:
first, last = name.rsplit(" ", maxsplit=1)
if last == last.upper() and len(last) > 1:
last = last.capitalize()
if first == first.upper() and len(first) > 1:
first = first.capitalize()
name = f"{first} {last}"
assert not first.upper() == first, f"Bad {name=} from {commit}"
assert _good_name(name), f"Bad {name=} from {commit}"
if "King" in name:
assert name == "Jean-Rémi King", name
if name is None:
bad_commits.add(commit)
continue
if name in used_authors:
continue
if not _good_name(name) and name not in expected_bad_names:
expected_bad_names[name] = f"{name} from #{commit}"
if author["e"]:
expected_bad_names[name] += f" email {author['e']}"
assert name.strip(), repr(name)
used_authors.add(name)
# treat moves and permission changes like a single-line change
if p == m == 0:
p = 1
commits[(name, commit)] += p + m
stats[file][name] += [p, m]
if bad_commits:
raise RuntimeError(
"Run:\nrm "
+ " ".join(f"{bad}.json" for bad in sorted(bad_commits, key=int))
)
# Check for duplicate names based on last name, and also singleton names.
last_map = defaultdict(lambda: set())
bad_names = set()
for these_stats in stats.values():
for name in these_stats:
assert name == name.strip(), f"Un-stripped name: {repr(name)}"
last = name.split()[-1]
first = name.split()[0]
last_map[last].add(name)
name_where = expected_bad_names.get(name, name)
if last == name and name not in single_names:
bad_names.add(f"Singleton: {name_where}")
if "." in last or "." in first and name not in exceptions:
bad_names.add(f"Abbreviation: {name_where}")
bad_names = sorted(bad_names)
for last, names in last_map.items():
if len(names) > name_counts.get(last, 1):
bad_names.append(f"Duplicates: {sorted(names)}")
if bad_names:
what = (
"Unexpected possible duplicates or bad names found, "
f"consider modifying {'/'.join(Path(__file__).parts[-3:])}:\n"
)
raise RuntimeError(what + "\n".join(bad_names))
unknown_emails = set(
email
for email in unknown_emails
if "autofix-ci[bot]" not in email
and "pre-commit-ci[bot]" not in email
and "dependabot[bot]" not in email
and "github-actions[bot]" not in email
)
what = "Unknown emails, consider adding to .mailmap:\n"
assert len(unknown_emails) == 0, what + "\n".join(sorted(unknown_emails))
logger.info("Biggest included commits/PRs:")
commits = dict(
(k, commits[k])
for k in sorted(commits, key=lambda k_: commits[k_], reverse=True)
)
for ni, name in enumerate(commits, 1):
if ni > 10:
break
logger.info(f"{str(name[1]).ljust(5)} @ {commits[name]:5d} by {name[0]}")
logger.info("\nIgnored commits:")
# Report the ignores
for commit in ignores: # should have found one of each
logger.info(f"ignored {len(ignores[commit]):3d} files for {commit}")
assert len(ignores[commit]) >= 1, (ignores[commit], commit)
globs = dict()
# This is the mapping from changed filename globs to module names on the website.
# We need to include aliases for old stuff. Anything we want to exclude we put in
# "null" with a higher priority (i.e., in dict first):
link_overrides = dict() # overrides for links
for key in """
*.qrc *.png *.svg *.ico *.elc *.sfp *.lout *.lay *.csd *.txt
mne/_version.py mne/externals/* */__init__.py* */resources.py paper.bib
mne/html/*.css mne/html/*.js mne/io/bti/tests/data/* */SHA1SUMS *__init__py
AUTHORS.rst CITATION.cff CONTRIBUTING.rst codemeta.json mne/tests/*.* jr-tools
*/whats_new.rst */latest.inc */devel.rst */changelog.rst */manual/* doc/*.json
logo/LICENSE doc/credit.rst
""".strip().split():
globs[key] = "null"
# Now onto the actual module organization
root_path = pathlib.Path(mne.__file__).parent
mod_file_map = dict()
for file in root_path.iterdir():
rel = file.relative_to(root_path).with_suffix("")
mod = f"mne.{rel}"
if file.is_dir():
globs[f"mne/{rel}/*.*"] = mod
globs[f"mne/{rel}.*"] = mod
elif file.is_file() and file.suffix == ".py":
key = f"mne/{rel}.py"
if file.stem == "conftest":
globs[key] = "maintenance"
globs["conftest.py"] = "maintenance"
else:
globs[key] = mod
mod_file_map[mod] = key
globs["mne/artifacts/*.py"] = "mne.preprocessing"
for key in """
pick.py constants.py info.py fiff/*.* _fiff/*.* raw.py testing.py _hdf5.py
compensator.py
""".strip().split():
globs[f"mne/{key}"] = "mne.io"
for key in ("mne/transforms/*.py", "mne/_freesurfer.py"):
globs[key] = "mne.transforms"
globs["mne/mixed_norm/*.py"] = "mne.inverse_sparse"
globs["mne/__main__.py"] = "mne.commands"
globs["bin/*"] = "mne.commands"
globs["mne/morph_map.py"] = "mne.surface"
globs["mne/baseline.py"] = "mne.epochs"
for key in """
parallel.py rank.py misc.py data/*.* defaults.py fixes.py icons/*.* icons.*
""".strip().split():
globs[f"mne/{key}"] = "mne.utils"
for key in ("mne/_ola.py", "mne/cuda.py"):
globs[key] = "mne.filter"
for key in """
*digitization/*.py layouts/*.py montages/*.py selection.py
""".strip().split():
globs[f"mne/{key}"] = "mne.channels"
globs["mne/sparse_learning/*.py"] = "mne.inverse_sparse"
globs["mne/csp.py"] = "mne.preprocessing"
globs["mne/bem_surfaces.py"] = "mne.bem"
globs["mne/coreg/*.py"] = "mne.coreg"
globs["mne/inverse.py"] = "mne.minimum_norm"
globs["mne/stc.py"] = "mne.source_estimate"
globs["mne/surfer.py"] = "mne.viz"
globs["mne/tfr.py"] = "mne.time_frequency"
globs["mne/connectivity/*.py"] = "mne-connectivity (moved)"
link_overrides["mne-connectivity (moved)"] = "mne-tools/mne-connectivity"
globs["mne/realtime/*.py"] = "mne-realtime (moved)"
link_overrides["mne-realtime (moved)"] = "mne-tools/mne-realtime"
globs["mne/html_templates/*.*"] = "mne.report"
globs[".circleci/*"] = "maintenance"
link_overrides["maintenance"] = "mne-tools/mne-python"
globs["tools/*"] = "maintenance"
globs["doc/*"] = "doc"
for key in ("*.py", "*.rst"):
for mod in ("examples", "tutorials", "doc"):
globs[f"{mod}/{key}"] = mod
for key in """
*.yml *.md setup.* MANIFEST.in Makefile README.rst flow_diagram.py *.toml
debian/* logo/*.py *.git* .pre-commit-config.yaml .mailmap .coveragerc make/*
""".strip().split():
globs[key] = "maintenance"
mod_stats = defaultdict(lambda: defaultdict(lambda: np.zeros(2, int)))
other_files = set()
total_lines = np.zeros(2, int)
for fname, counts in stats.items():
for pattern, mod in globs.items():
if glob.fnmatch.fnmatch(fname, pattern):
break
else:
other_files.add(fname)
mod = "other"
for e, pm in counts.items():
if mod == "mne._fiff":
raise RuntimeError
# sanity check a bit
if mod != "null" and (".png" in fname or "/manual/" in fname):
raise RuntimeError(f"Unexpected {mod} {fname}")
mod_stats[mod][e] += pm
mod_stats["mne"][e] += pm
total_lines += pm
mod_stats.pop("null") # stuff we shouldn't give credit for
mod_stats = dict(
(k, mod_stats[k])
for k in sorted(
mod_stats,
key=lambda x: (
not x.startswith("mne"),
x == "maintenance",
x.replace("-", "."),
),
)
) # sort modules alphabetically
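    # Resulting order (sketch): "mne" itself first, then the "mne.*" / "mne-*"
    # entries alphabetically (the "-" -> "." replacement interleaves the moved-out
    # packages), then "doc"/"examples"/"tutorials", with "maintenance" last.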
other_files = sorted(other_files)
if len(other_files):
raise RuntimeError(
f"{len(other_files)} misc file(s) found:\n" + "\n".join(other_files)
)
logger.info(f"\nTotal line change count: {list(map(int, total_lines))}")
# sphinx-design badges that we use for contributors
BADGE_KINDS = ["bdg-info-line", "bdg"]
content = f"""\
.. THIS FILE IS AUTO-GENERATED BY {Path(__file__).stem} AND WILL BE OVERWRITTEN
.. raw:: html
<style>
/* Make it occupy more page width */
.bd-main .bd-content .bd-article-container {{
max-width: 90vw;
}}
/* Limit max card height */
div.sd-card-body {{
max-height: 15em;
}}
</style>
.. _code_credit:
Code credit
===========
Below are lists of code contributors to MNE-Python. The numbers in parentheses are the
number of lines changed in our code history.
- :{BADGE_KINDS[0]}:`This badge` is used for the top 10% of contributors.
- :{BADGE_KINDS[1]}:`This badge` is used for the remaining 90% of contributors.
Entire codebase
---------------
"""
for mi, (mod, counts) in enumerate(mod_stats.items()):
if mi == 0:
assert mod == "mne", mod
indent = " " * 3
elif mi == 1:
indent = " " * 6
content += """
By submodule
------------
Contributors often have domain-specific expertise, so we've broken down the
contributions by submodule as well below.
.. grid:: 1 2 3 3
:gutter: 1
"""
        # Top-10% badge cutoff used below: with 10 contributors it is 1,
        # with 100 contributors it is 10.
these_stats = dict((k, v.sum()) for k, v in counts.items())
these_stats = dict(
(k, these_stats[k])
for k in sorted(these_stats, key=lambda x: these_stats[x], reverse=True)
)
if mod in link_overrides:
link = f"https://github.com/{link_overrides[mod]}"
else:
kind = "blame" if mod in mod_file_map else "tree"
link_mod = mod_file_map.get(mod, mod.replace(".", "/"))
link = f"https://github.com/mne-tools/mne-python/{kind}/main/{link_mod}"
assert "moved" not in link, (mod, link)
# Use badges because they flow nicely, inside a grid to make it more compact
stat_lines = []
for ki, (k, v) in enumerate(these_stats.items()):
# Round to two digits, e.g. 12340 -> 12000, 12560 -> 13000
v_round = int(float(f"{v:.2g}"))
assert v_round > 0, f"Got zero lines changed for {k} in {mod}: {v_round}"
# And then write as a max-3-char human-readable abbreviation like
# 123, 1.2k, 123k, 12m, etc.
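            # For example (hypothetical counts): 987 -> "990", 12345 -> "12k",
            # 1234567 -> "1.2m".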
for prefix in ("", "k", "m", "g"):
if v_round >= 1000:
v_round = v_round / 1000
else:
if v_round >= 10 or prefix == "": # keep single digit as 1 not 1.0
v_round = f"{int(round(v_round))}"
else:
v_round = f"{v_round:.1f}"
v_round += prefix
break
else:
raise RuntimeError(f"Too many digits in {v}")
idx = 0 if ki < (len(these_stats) - 1) // 10 + 1 else 1
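            # E.g. with 25 contributors (hypothetical), the cutoff is
            # (25 - 1) // 10 + 1 == 3, so the three largest contributors get the
            # highlighted badge kind (BADGE_KINDS[0]).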
if any(b in k for b in ("[bot]", "Lumberbot", "Deleted user")):
continue
assert _good_name(k)
stat_lines.append(f":{BADGE_KINDS[idx]}:`{k} ({v_round})`")
stat_lines = f"\n{indent}".join(stat_lines)
if mi == 0:
content += f"""
.. card:: {mod}
:class-card: overflow-auto
:link: https://github.com/mne-tools/mne-python/graphs/contributors
{indent}{stat_lines}
"""
else:
content += f"""
.. grid-item-card:: {mod}
:class-card: overflow-auto
:link: {link}
{indent}{stat_lines}
"""
(doc_root / "code_credit.inc").write_text(content, encoding="utf-8")


if __name__ == "__main__":
generate_credit_rst(verbose=True)