Skip to content

Commit e3d5ad1

Browse files
committed
CompoundDB: implemented a simple dict-based first-level cache, which really helps to improve performance in real-world applications that need to quickly determine whether objects are in the database or not, as happens, for instance, during index_to_tree conversion.
Added a separate pack streaming test, which shows a throughput of only 250 streams / s in a densely packed pack, and about 3.5 MiB of data throughput. Performance tests show that half the time is spent collecting the numerous deltas, and the other half decompressing and applying them. Fixed broken performance tests.
1 parent d3a0037 commit e3d5ad1

File tree

8 files changed

+64
-8
lines changed

8 files changed

+64
-8
lines changed

db/base.py

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -183,19 +183,29 @@ class CompoundDB(ObjectDBR, LazyMixin, CachingDB):
183183
184184
Databases are stored in the lazy-loaded _dbs attribute.
185185
Define _set_cache_ to update it with your databases"""
186-
187186
def _set_cache_(self, attr):
188187
if attr == '_dbs':
189188
self._dbs = list()
189+
elif attr == '_db_cache':
190+
self._db_cache = dict()
191+
else:
192+
super(CompoundDB, self)._set_cache_(attr)
190193

191194
def _db_query(self, sha):
192195
""":return: database containing the given 20 or 40 byte sha
193196
:raise BadObject:"""
194197
# most databases use binary representations, prevent converting
195198
# it every time a database is being queried
196199
sha = to_bin_sha(sha)
200+
try:
201+
return self._db_cache[sha]
202+
except KeyError:
203+
pass
204+
# END first level cache
205+
197206
for db in self._dbs:
198207
if db.has_object(sha):
208+
self._db_cache[sha] = db
199209
return db
200210
# END for each database
201211
raise BadObject(sha)
@@ -232,6 +242,8 @@ def databases(self):
232242
return tuple(self._dbs)
233243

234244
def update_cache(self, force=False):
245+
# something might have changed, clear everything
246+
self._db_cache.clear()
235247
stat = False
236248
for db in self._dbs:
237249
if isinstance(db, CachingDB):

db/git.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -57,8 +57,9 @@ def _set_cache_(self, attr):
5757

5858
# finally set the value
5959
self._loose_db = loose_db
60-
61-
# END handle dbs
60+
else:
61+
super(GitDB, self)._set_cache_(attr)
62+
# END handle attrs
6263

6364
#{ ObjectDBW interface
6465

db/pack.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -46,7 +46,7 @@ def __init__(self, root_path):
4646
def _set_cache_(self, attr):
4747
if attr == '_entities':
4848
self._entities = list()
49-
self.update_cache()
49+
self.update_cache(force=True)
5050
# END handle entities initialization
5151

5252
def _sort_entities(self):

db/ref.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,9 @@ def _set_cache_(self, attr):
2121
if attr == '_dbs':
2222
self._dbs = list()
2323
self._update_dbs_from_ref_file()
24-
# END handle dbs
24+
else:
25+
super(ReferenceDB, self)._set_cache_(attr)
26+
# END handle attrs
2527

2628
def _update_dbs_from_ref_file(self):
2729
dbcls = self.ObjectDBCls

exc.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
"""Module with common exceptions"""
2+
from util import to_hex_sha
23

34
class ODBError(Exception):
45
"""All errors thrown by the object database"""
@@ -7,7 +8,11 @@ class InvalidDBRoot(ODBError):
78
"""Thrown if an object database cannot be initialized at the given path"""
89

910
class BadObject(ODBError):
10-
"""The object with the given SHA does not exist"""
11+
"""The object with the given SHA does not exist. Instantiate with the
12+
failed sha"""
13+
14+
def __str__(self):
15+
return "BadObject: %s" % to_hex_sha(self.args[0])
1116

1217
class BadObjectType(ODBError):
1318
"""The object had an unsupported type"""

test/performance/test_pack.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -72,8 +72,10 @@ def test_pack_random_access(self):
7272
total_kib = total_size / 1000
7373
print >> sys.stderr, "PDB: Obtained %i streams by sha and read all bytes totallying %i KiB ( %f KiB / s ) in %f s ( %f streams/s )" % (max_items, total_kib, total_kib/elapsed , elapsed, max_items / elapsed)
7474

75-
76-
print >> sys.stderr, "Endurance run: verify streaming of %i objects (crc and sha)" % ns
75+
def _disabled_test_correctness(self):
76+
pdb = PackedDB(os.path.join(self.gitrepopath, "objects/pack"))
77+
# disabled for now as it used to work perfectly, checking big repositories takes a long time
78+
print >> sys.stderr, "Endurance run: verify streaming of objects (crc and sha)"
7779
for crc in range(2):
7880
count = 0
7981
st = time()
Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
"""Specific test for pack streams only"""
2+
from lib import (
3+
TestBigRepoR
4+
)
5+
6+
from gitdb.db.pack import PackedDB
7+
8+
import os
9+
import sys
10+
from time import time
11+
12+
class TestPackStreamingPerformance(TestBigRepoR):
13+
14+
def test_stream_reading(self):
15+
pdb = PackedDB(os.path.join(self.gitrepopath, "objects/pack"))
16+
17+
# streaming only, meant for --with-profile runs
18+
ni = 5000
19+
count = 0
20+
pdb_stream = pdb.stream
21+
total_size = 0
22+
st = time()
23+
for sha in pdb.sha_iter():
24+
if count == ni:
25+
break
26+
stream = pdb_stream(sha)
27+
stream.read()
28+
total_size += stream.size
29+
count += 1
30+
elapsed = time() - st
31+
total_kib = total_size / 1000
32+
print >> sys.stderr, "PDB Streaming: Got %i streams by sha and read all bytes totallying %i KiB ( %f KiB / s ) in %f s ( %f streams/s )" % (ni, total_kib, total_kib/elapsed , elapsed, ni / elapsed)
33+

test/performance/test_stream.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
"""Performance data streaming performance"""
22
from lib import TestBigRepoR
33
from gitdb.db import *
4+
from gitdb.base import *
45
from gitdb.stream import *
56
from gitdb.util import pool
67
from gitdb.typ import str_blob_type

0 commit comments

Comments
 (0)