Skip to content
Draft
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Next Next commit
Fix UnicodeDecodeError when reading packed-refs with non-UTF8 characters
Fixes #2064

The packed-refs file can contain ref names that are not valid UTF-8
(e.g., Latin-1 encoded tag names created by older Git versions or
non-UTF8 systems). Previously, opening the file with encoding='UTF-8'
would raise UnicodeDecodeError.

Changes:
- Add errors='surrogateescape' to the open() call in _iter_packed_refs()
- This allows reading files with arbitrary byte sequences while still
  treating valid UTF-8 as text
- Add test that verifies non-UTF8 packed-refs can be read successfully

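The 'surrogateescape' error handler is the standard Python approach for
handling potentially non-UTF-8 data in filesystem operations: each
undecodable byte is mapped to a lone surrogate code point (U+DC80-U+DCFF),
and encoding the string again with the same handler restores the original
bytes exactly. Note that such strings cannot be encoded with the default
strict UTF-8 handler.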
  • Loading branch information
MirrorDNA-Reflection-Protocol committed Dec 7, 2025
commit 40af3b395cebed928b0eb68fe1fdb70bee1f9288
2 changes: 1 addition & 1 deletion git/refs/symbolic.py
Original file line number Diff line number Diff line change
Expand Up @@ -123,7 +123,7 @@ def _iter_packed_refs(cls, repo: "Repo") -> Iterator[Tuple[str, str]]:
The packed refs file will be kept open as long as we iterate.
"""
try:
with open(cls._get_packed_refs_path(repo), "rt", encoding="UTF-8") as fp:
with open(cls._get_packed_refs_path(repo), "rt", encoding="UTF-8", errors="surrogateescape") as fp:
for line in fp:
line = line.strip()
if not line:
Expand Down
39 changes: 39 additions & 0 deletions test/test_refs.py
Original file line number Diff line number Diff line change
Expand Up @@ -629,6 +629,45 @@ def test_tag_message(self, rw_repo):
)
assert tag_ref.tag.message == "test2"

@with_rw_repo("0.1.6")
def test_packed_refs_with_non_utf8_encoding(self, rw_repo):
    """Test that packed-refs files with non-UTF-8 encoded ref names can be read.

    This addresses issue #2064 where GitPython would fail with UnicodeDecodeError
    when reading packed-refs files containing bytes that are not valid UTF-8
    (e.g., Latin-1 encoded tag names).

    Beyond "does not raise", the test also checks that the injected ref is
    actually yielded and that its original bytes can be recovered from the
    decoded name.
    """
    # Create a tag with an ASCII name first, then pack all refs.
    TagReference.create(rw_repo, "normal-tag")
    rw_repo.git.pack_refs(all=True)

    # Append a fake ref whose name ends in byte 0xF1 ("ñ" in Latin-1). Followed
    # by a newline, 0xF1 is an incomplete multi-byte lead byte and therefore
    # not valid UTF-8. A valid SHA from the repo is used for the entry.
    head_sha = rw_repo.head.commit.hexsha
    raw_ref_name = b"refs/tags/caf\xf1"
    packed_refs_path = osp.join(rw_repo.common_dir, "packed-refs")
    with open(packed_refs_path, "ab") as f:
        # The leading newline keeps the new entry on its own line even if the
        # existing content does not end with one.
        f.write(b"\n" + head_sha.encode("ascii") + b" " + raw_ref_name + b"\n")

    # This should NOT raise UnicodeDecodeError with the fix.
    tags = list(rw_repo.tags)
    assert len(tags) >= 1

    # Verify we can iterate packed refs without error.
    packed_refs = list(SymbolicReference._iter_packed_refs(rw_repo))
    assert len(packed_refs) >= 2  # At least normal-tag and the non-UTF8 tag

    # With errors="surrogateescape" the undecodable byte 0xF1 is decoded to the
    # lone surrogate U+DCF1; the entry must be present under that name.
    escaped_ref_name = "refs/tags/caf\udcf1"
    assert any(escaped_ref_name in entry for entry in packed_refs)

    # The decoding is reversible: the original bytes can be recovered.
    assert escaped_ref_name.encode("utf-8", "surrogateescape") == raw_ref_name

def test_dereference_recursive(self):
# For now, just test the HEAD.
assert SymbolicReference.dereference_recursive(self.rorepo, "HEAD")
Expand Down
Loading