From 62815b5c5a4c39e9ace3d20ec0c593011201dbcf Mon Sep 17 00:00:00 2001 From: Thomas Grainger Date: Tue, 4 Oct 2016 18:04:24 +0100 Subject: [PATCH] support optional gitdb_speedups --- README.rst | 7 + gitdb/_delta_apply.c | 1154 ------------------------------------------ gitdb/_delta_apply.h | 6 - gitdb/_fun.c | 107 ---- gitdb/fun.py | 2 +- gitdb/pack.py | 2 +- gitdb/stream.py | 2 +- setup.cfg | 2 + setup.py | 154 ++---- 9 files changed, 49 insertions(+), 1387 deletions(-) delete mode 100644 gitdb/_delta_apply.c delete mode 100644 gitdb/_delta_apply.h delete mode 100644 gitdb/_fun.c create mode 100644 setup.cfg diff --git a/README.rst b/README.rst index 766d9d6..1b754d0 100644 --- a/README.rst +++ b/README.rst @@ -20,6 +20,13 @@ From `PyPI `_ pip install gitdb +SPEEDUPS +======== + +If you want to go up to 20% faster, you can install gitdb-speedups with: + + pip install gitdb-speedups + REQUIREMENTS ============ diff --git a/gitdb/_delta_apply.c b/gitdb/_delta_apply.c deleted file mode 100644 index f4fffdc..0000000 --- a/gitdb/_delta_apply.c +++ /dev/null @@ -1,1154 +0,0 @@ -#include <_delta_apply.h> -#include -#include -#include -#include -#include - - - -typedef unsigned long long ull; -typedef unsigned int uint; -typedef unsigned char uchar; -typedef unsigned short ushort; -typedef uchar bool; - -// Constants -const ull gDIV_grow_by = 100; - - - -// DELTA STREAM ACCESS -/////////////////////// -inline -ull msb_size(const uchar** datap, const uchar* top) -{ - const uchar *data = *datap; - ull cmd, size = 0; - uint i = 0; - do { - cmd = *data++; - size |= (cmd & 0x7f) << i; - i += 7; - } while (cmd & 0x80 && data < top); - *datap = data; - return size; -} - - -// TOP LEVEL STREAM INFO -///////////////////////////// -typedef struct { - const uchar *tds; // Toplevel delta stream - const uchar *cstart; // start of the chunks - Py_ssize_t tdslen; // size of tds in bytes - Py_ssize_t target_size; // size of the target buffer which can hold all data - uint num_chunks; // amount of chunks in the delta stream - PyObject *parent_object; -} ToplevelStreamInfo; - - -void TSI_init(ToplevelStreamInfo* info) -{ - info->tds = NULL; - info->cstart = NULL; - info->tdslen = 0; - info->num_chunks = 0; - info->target_size = 0; - info->parent_object = 0; -} - -void TSI_destroy(ToplevelStreamInfo* info) -{ -#ifdef DEBUG - fprintf(stderr, "TSI_destroy: %p\n", info); -#endif - - if (info->parent_object){ - Py_DECREF(info->parent_object); - info->parent_object = NULL; - } else if (info->tds){ - PyMem_Free((void*)info->tds); - } - info->tds = NULL; - info->cstart = NULL; - info->tdslen = 0; - info->num_chunks = 0; -} - -inline -const uchar* TSI_end(ToplevelStreamInfo* info) -{ - return info->tds + info->tdslen; -} - -inline -const uchar* TSI_first(ToplevelStreamInfo* info) -{ - return info->cstart; -} - -// set the stream, and initialize it -// initialize our set stream to point to the first chunk -// Fill in the header information, which is the base and target size -inline -void TSI_set_stream(ToplevelStreamInfo* info, const uchar* stream) -{ - info->tds = stream; - info->cstart = stream; - - assert(info->tds && info->tdslen); - - // init stream - const uchar* tdsend = TSI_end(info); - msb_size(&info->cstart, tdsend); // base size - info->target_size = msb_size(&info->cstart, tdsend); -} - - - -// duplicate the data currently owned by the parent object drop its refcount -// return 1 on success -bool TSI_copy_stream_from_object(ToplevelStreamInfo* info) -{ - assert(info->parent_object); - - uchar* ptmp = PyMem_Malloc(info->tdslen); - if (!ptmp){ - return 0; - } - uint ofs = (uint)(info->cstart - info->tds); - memcpy((void*)ptmp, info->tds, info->tdslen); - - info->tds = ptmp; - info->cstart = ptmp + ofs; - - Py_DECREF(info->parent_object); - info->parent_object = 0; - - return 1; -} - -// Transfer ownership of the given stream into our instance. The amount of chunks -// remains the same, and needs to be set by the caller -void TSI_replace_stream(ToplevelStreamInfo* info, const uchar* stream, uint streamlen) -{ - assert(info->parent_object == 0); - - uint ofs = (uint)(info->cstart - info->tds); - if (info->tds){ - PyMem_Free((void*)info->tds); - } - info->tds = stream; - info->cstart = info->tds + ofs; - info->tdslen = streamlen; - -} - -// DELTA CHUNK -//////////////// -// Internal Delta Chunk Objects -// They are just used to keep information parsed from a stream -// The data pointer is always shared -typedef struct { - ull to; - uint ts; - uint so; - const uchar* data; -} DeltaChunk; - -// forward declarations -const uchar* next_delta_info(const uchar*, DeltaChunk*); - -inline -void DC_init(DeltaChunk* dc, ull to, ull ts, ull so, const uchar* data) -{ - dc->to = to; - dc->ts = ts; - dc->so = so; - dc->data = NULL; -} - - -inline -ull DC_rbound(const DeltaChunk* dc) -{ - return dc->to + dc->ts; -} - -inline -void DC_print(const DeltaChunk* dc, const char* prefix) -{ - fprintf(stderr, "%s-dc: to = %i, ts = %i, so = %i, data = %p\n", prefix, (int)dc->to, dc->ts, dc->so, dc->data); -} - -// Apply -inline -void DC_apply(const DeltaChunk* dc, const uchar* base, PyObject* writer, PyObject* tmpargs) -{ - PyObject* buffer = 0; - if (dc->data){ - buffer = PyBuffer_FromMemory((void*)dc->data, dc->ts); - } else { - buffer = PyBuffer_FromMemory((void*)(base + dc->so), dc->ts); - } - - if (PyTuple_SetItem(tmpargs, 0, buffer)){ - assert(0); - } - - - // tuple steals reference, and will take care about the deallocation - PyObject_Call(writer, tmpargs, NULL); - -} - -// Encode the information in the given delta chunk and write the byte-stream -// into the given output stream -// It will be copied into the given bounds, the given size must be the final size -// and work with the given relative offset - hence the bounds are assumed to be -// correct and to fit within the unaltered dc -inline -void DC_encode_to(const DeltaChunk* dc, uchar** pout, uint ofs, uint size) -{ - uchar* out = *pout; - if (dc->data){ - *out++ = (uchar)size; - memcpy(out, dc->data+ofs, size); - out += size; - } else { - uchar i = 0x80; - uchar* op = out++; - uint moff = dc->so+ofs; - - if (moff & 0x000000ff) - *out++ = moff >> 0, i |= 0x01; - if (moff & 0x0000ff00) - *out++ = moff >> 8, i |= 0x02; - if (moff & 0x00ff0000) - *out++ = moff >> 16, i |= 0x04; - if (moff & 0xff000000) - *out++ = moff >> 24, i |= 0x08; - - if (size & 0x00ff) - *out++ = size >> 0, i |= 0x10; - if (size & 0xff00) - *out++ = size >> 8, i |= 0x20; - - *op = i; - } - - *pout = out; -} - -// Return: amount of bytes one would need to encode dc -inline -ushort DC_count_encode_bytes(const DeltaChunk* dc) -{ - if (dc->data){ - return 1 + dc->ts; // cmd byte + actual data bytes - } else { - ushort c = 1; // cmd byte - uint ts = dc->ts; - ull so = dc->so; - - // offset - c += (so & 0x000000FF) > 0; - c += (so & 0x0000FF00) > 0; - c += (so & 0x00FF0000) > 0; - c += (so & 0xFF000000) > 0; - - // size - max size is 0x10000, its encoded with 0 size bits - c += (ts & 0x000000FF) > 0; - c += (ts & 0x0000FF00) > 0; - - return c; - } -} - - - -// DELTA INFO -///////////// -typedef struct { - uint dso; // delta stream offset, relative to the very start of the stream - uint to; // target offset (cache) -} DeltaInfo; - - -// DELTA INFO VECTOR -////////////////////// - -typedef struct { - DeltaInfo *mem; // Memory for delta infos - uint di_last_size; // size of the last element - we can't compute it using the next bound - const uchar *dstream; // borrowed ointer to delta stream we index - Py_ssize_t size; // Amount of DeltaInfos - Py_ssize_t reserved_size; // Reserved amount of DeltaInfos -} DeltaInfoVector; - - - -// Reserve enough memory to hold the given amount of delta chunks -// Return 1 on success -// NOTE: added a minimum allocation to assure reallocation is not done -// just for a single additional entry. DIVs change often, and reallocs are expensive -inline -int DIV_reserve_memory(DeltaInfoVector* vec, uint num_dc) -{ - if (num_dc <= vec->reserved_size){ - return 1; - } - -#ifdef DEBUG - bool was_null = vec->mem == NULL; -#endif - - if (vec->mem == NULL){ - vec->mem = PyMem_Malloc(num_dc * sizeof(DeltaInfo)); - } else { - vec->mem = PyMem_Realloc(vec->mem, num_dc * sizeof(DeltaInfo)); - } - - if (vec->mem == NULL){ - Py_FatalError("Could not allocate memory for append operation"); - } - - vec->reserved_size = num_dc; - -#ifdef DEBUG - const char* format = "Allocated %i bytes at %p, to hold up to %i chunks\n"; - if (!was_null) - format = "Re-allocated %i bytes at %p, to hold up to %i chunks\n"; - fprintf(stderr, format, (int)(vec->reserved_size * sizeof(DeltaInfo)), vec->mem, (int)vec->reserved_size); -#endif - - return vec->mem != NULL; -} - -/* -Grow the delta chunk list by the given amount of bytes. -This may trigger a realloc, but will do nothing if the reserved size is already -large enough. -Return 1 on success, 0 on failure -*/ -inline -int DIV_grow_by(DeltaInfoVector* vec, uint num_dc) -{ - return DIV_reserve_memory(vec, vec->reserved_size + num_dc); -} - -int DIV_init(DeltaInfoVector* vec, ull initial_size) -{ - vec->mem = NULL; - vec->dstream = NULL; - vec->size = 0; - vec->reserved_size = 0; - vec->di_last_size = 0; - - return DIV_grow_by(vec, initial_size); -} - -inline -Py_ssize_t DIV_len(const DeltaInfoVector* vec) -{ - return vec->size; -} - -inline -uint DIV_lbound(const DeltaInfoVector* vec) -{ - assert(vec->size && vec->mem); - return vec->mem->to; -} - -// Return item at index -inline -DeltaInfo* DIV_get(const DeltaInfoVector* vec, Py_ssize_t i) -{ - assert(i < vec->size && vec->mem); - return &vec->mem[i]; -} - -// Return last item -inline -DeltaInfo* DIV_last(const DeltaInfoVector* vec) -{ - return DIV_get(vec, vec->size-1); -} - -inline -int DIV_empty(const DeltaInfoVector* vec) -{ - return vec->size == 0; -} - -// Return end pointer of the vector -inline -const DeltaInfo* DIV_end(const DeltaInfoVector* vec) -{ - assert(!DIV_empty(vec)); - return vec->mem + vec->size; -} - -// return first item in vector -inline -DeltaInfo* DIV_first(const DeltaInfoVector* vec) -{ - assert(!DIV_empty(vec)); - return vec->mem; -} - -// return rbound offset in bytes. We use information contained in the -// vec to do that -inline -uint DIV_info_rbound(const DeltaInfoVector* vec, const DeltaInfo* di) -{ - if (DIV_last(vec) == di){ - return di->to + vec->di_last_size; - } else { - return (di+1)->to; - } -} - -// return size of the given delta info item -inline -uint DIV_info_size2(const DeltaInfoVector* vec, const DeltaInfo* di, const DeltaInfo* const veclast) -{ - if (veclast == di){ - return vec->di_last_size; - } else { - return (di+1)->to - di->to; - } -} - -// return size of the given delta info item -inline -uint DIV_info_size(const DeltaInfoVector* vec, const DeltaInfo* di) -{ - return DIV_info_size2(vec, di, DIV_last(vec)); -} - -void DIV_destroy(DeltaInfoVector* vec) -{ - if (vec->mem){ -#ifdef DEBUG - fprintf(stderr, "DIV_destroy: %p\n", (void*)vec->mem); -#endif - PyMem_Free(vec->mem); - vec->size = 0; - vec->reserved_size = 0; - vec->mem = 0; - } -} - -// Reset this vector so that its existing memory can be filled again. -// Memory will be kept, but not cleaned up -inline -void DIV_forget_members(DeltaInfoVector* vec) -{ - vec->size = 0; -} - -// Reset the vector so that its size will be zero -// It will keep its memory though, and hence can be filled again -inline -void DIV_reset(DeltaInfoVector* vec) -{ - if (vec->size == 0) - return; - vec->size = 0; -} - - -// Append one chunk to the end of the list, and return a pointer to it -// It will not have been initialized ! -inline -DeltaInfo* DIV_append(DeltaInfoVector* vec) -{ - if (vec->size + 1 > vec->reserved_size){ - DIV_grow_by(vec, gDIV_grow_by); - } - - DeltaInfo* next = vec->mem + vec->size; - vec->size += 1; - return next; -} - -// Return delta chunk being closest to the given absolute offset -inline -DeltaInfo* DIV_closest_chunk(const DeltaInfoVector* vec, ull ofs) -{ - assert(vec->mem); - - ull lo = 0; - ull hi = vec->size; - ull mid; - DeltaInfo* di; - - while (lo < hi) - { - mid = (lo + hi) / 2; - di = vec->mem + mid; - if (di->to > ofs){ - hi = mid; - } else if ((DIV_info_rbound(vec, di) > ofs) | (di->to == ofs)) { - return di; - } else { - lo = mid + 1; - } - } - - return DIV_last(vec); -} - - -// Return the amount of chunks a slice at the given spot would have, as well as -// its size in bytes it would have if the possibly partial chunks would be encoded -// and added to the spot marked by sdc -uint DIV_count_slice_bytes(const DeltaInfoVector* src, uint ofs, uint size) -{ - uint num_bytes = 0; - DeltaInfo* cdi = DIV_closest_chunk(src, ofs); - - DeltaChunk dc; - DC_init(&dc, 0, 0, 0, NULL); - - // partial overlap - if (cdi->to != ofs) { - const ull relofs = ofs - cdi->to; - const uint cdisize = DIV_info_size(src, cdi); - const uint max_size = cdisize - relofs < size ? cdisize - relofs : size; - size -= max_size; - - // get the size in bytes the info would have - next_delta_info(src->dstream + cdi->dso, &dc); - dc.so += relofs; - dc.ts = max_size; - num_bytes += DC_count_encode_bytes(&dc); - - cdi += 1; - - if (size == 0){ - return num_bytes; - } - } - - const DeltaInfo* const vecend = DIV_end(src); - const uchar* nstream; - for( ;cdi < vecend; ++cdi){ - nstream = next_delta_info(src->dstream + cdi->dso, &dc); - - if (dc.ts < size) { - num_bytes += nstream - (src->dstream + cdi->dso); - size -= dc.ts; - } else { - dc.ts = size; - num_bytes += DC_count_encode_bytes(&dc); - size = 0; - break; - } - } - - assert(size == 0); - return num_bytes; -} - -// Write a slice as defined by its absolute offset in bytes and its size into the given -// destination memory. The individual chunks written will be a byte copy of the source -// data chunk stream -// Return: number of chunks in the slice -uint DIV_copy_slice_to(const DeltaInfoVector* src, uchar** dest, ull tofs, uint size) -{ - assert(DIV_lbound(src) <= tofs); - assert((tofs + size) <= DIV_info_rbound(src, DIV_last(src))); - - DeltaChunk dc; - DC_init(&dc, 0, 0, 0, NULL); - - DeltaInfo* cdi = DIV_closest_chunk(src, tofs); - uint num_chunks = 0; - - // partial overlap - if (cdi->to != tofs) { - const uint relofs = tofs - cdi->to; - next_delta_info(src->dstream + cdi->dso, &dc); - const uint max_size = dc.ts - relofs < size ? dc.ts - relofs : size; - - size -= max_size; - - // adjust dc proportions - DC_encode_to(&dc, dest, relofs, max_size); - - num_chunks += 1; - cdi += 1; - - if (size == 0){ - return num_chunks; - } - } - - const uchar* dstream = src->dstream + cdi->dso; - const uchar* nstream = dstream; - for( ; nstream; dstream = nstream) - { - num_chunks += 1; - nstream = next_delta_info(dstream, &dc); - if (dc.ts < size) { - memcpy(*dest, dstream, nstream - dstream); - *dest += nstream - dstream; - size -= dc.ts; - } else { - DC_encode_to(&dc, dest, 0, size); - size = 0; - break; - } - } - - assert(size == 0); - return num_chunks; -} - - -// Take slices of div into the corresponding area of the tsi, which is the topmost -// delta to apply. -bool DIV_connect_with_base(ToplevelStreamInfo* tsi, DeltaInfoVector* div) -{ - assert(tsi->num_chunks); - - - uint num_bytes = 0; - const uchar* data = TSI_first(tsi); - const uchar* dend = TSI_end(tsi); - - DeltaChunk dc; - DC_init(&dc, 0, 0, 0, NULL); - - - // COMPUTE SIZE OF TARGET STREAM - ///////////////////////////////// - for (;data < dend;) - { - data = next_delta_info(data, &dc); - - // Data chunks don't need processing - if (dc.data){ - num_bytes += 1 + dc.ts; - continue; - } - - num_bytes += DIV_count_slice_bytes(div, dc.so, dc.ts); - } - assert(DC_rbound(&dc) == tsi->target_size); - - - // GET NEW DELTA BUFFER - //////////////////////// - uchar *const dstream = PyMem_Malloc(num_bytes); - if (!dstream){ - return 0; - } - - - data = TSI_first(tsi); - const uchar *ndata = data; - dend = TSI_end(tsi); - - uint num_chunks = 0; - uchar* ds = dstream; - DC_init(&dc, 0, 0, 0, NULL); - - // pick slices from the delta and put them into the new stream - for (; data < dend; data = ndata) - { - ndata = next_delta_info(data, &dc); - - // Data chunks don't need processing - if (dc.data){ - // just copy it over - memcpy((void*)ds, (void*)data, ndata - data); - ds += ndata - data; - num_chunks += 1; - continue; - } - - // Copy Chunks - num_chunks += DIV_copy_slice_to(div, &ds, dc.so, dc.ts); - } - assert(ds - dstream == num_bytes); - assert(num_chunks >= tsi->num_chunks); - assert(DC_rbound(&dc) == tsi->target_size); - - // finally, replace the streams - TSI_replace_stream(tsi, dstream, num_bytes); - tsi->cstart = dstream; // we have NO header ! - assert(tsi->tds == dstream); - tsi->num_chunks = num_chunks; - - - return 1; - -} - -// DELTA CHUNK LIST (PYTHON) -///////////////////////////// -// Internally, it has nothing to do with a ChunkList anymore though -typedef struct { - PyObject_HEAD - // ----------- - ToplevelStreamInfo istream; - -} DeltaChunkList; - - - -int DCL_init(DeltaChunkList*self, PyObject *args, PyObject *kwds) -{ - if(args && PySequence_Size(args) > 0){ - PyErr_SetString(PyExc_ValueError, "Too many arguments"); - return -1; - } - - TSI_init(&self->istream); - return 0; -} - - -void DCL_dealloc(DeltaChunkList* self) -{ - TSI_destroy(&(self->istream)); -} - - -PyObject* DCL_py_rbound(DeltaChunkList* self) -{ - return PyLong_FromUnsignedLongLong(self->istream.target_size); -} - -// Write using a write function, taking remaining bytes from a base buffer - -PyObject* DCL_apply(DeltaChunkList* self, PyObject* args) -{ - PyObject* pybuf = 0; - PyObject* writeproc = 0; - if (!PyArg_ParseTuple(args, "OO", &pybuf, &writeproc)){ - PyErr_BadArgument(); - return NULL; - } - - if (!PyObject_CheckReadBuffer(pybuf)){ - PyErr_SetString(PyExc_ValueError, "First argument must be a buffer-compatible object, like a string, or a memory map"); - return NULL; - } - - if (!PyCallable_Check(writeproc)){ - PyErr_SetString(PyExc_ValueError, "Second argument must be a writer method with signature write(buf)"); - return NULL; - } - - const uchar* base; - Py_ssize_t baselen; - PyObject_AsReadBuffer(pybuf, (const void**)&base, &baselen); - - PyObject* tmpargs = PyTuple_New(1); - - const uchar* data = TSI_first(&self->istream); - const uchar* const dend = TSI_end(&self->istream); - - DeltaChunk dc; - DC_init(&dc, 0, 0, 0, NULL); - - while (data < dend){ - data = next_delta_info(data, &dc); - DC_apply(&dc, base, writeproc, tmpargs); - } - - Py_DECREF(tmpargs); - Py_RETURN_NONE; -} - -PyMethodDef DCL_methods[] = { - {"apply", (PyCFunction)DCL_apply, METH_VARARGS, "Apply the given iterable of delta streams" }, - {"rbound", (PyCFunction)DCL_py_rbound, METH_NOARGS, NULL}, - {NULL} /* Sentinel */ -}; - -PyTypeObject DeltaChunkListType = { - PyObject_HEAD_INIT(NULL) - 0, /*ob_size*/ - "DeltaChunkList", /*tp_name*/ - sizeof(DeltaChunkList), /*tp_basicsize*/ - 0, /*tp_itemsize*/ - (destructor)DCL_dealloc, /*tp_dealloc*/ - 0, /*tp_print*/ - 0, /*tp_getattr*/ - 0, /*tp_setattr*/ - 0, /*tp_compare*/ - 0, /*tp_repr*/ - 0, /*tp_as_number*/ - 0, /*tp_as_sequence*/ - 0, /*tp_as_mapping*/ - 0, /*tp_hash */ - 0, /*tp_call*/ - 0, /*tp_str*/ - 0, /*tp_getattro*/ - 0, /*tp_setattro*/ - 0, /*tp_as_buffer*/ - Py_TPFLAGS_DEFAULT, /*tp_flags*/ - "Minimal Delta Chunk List",/* tp_doc */ - 0, /* tp_traverse */ - 0, /* tp_clear */ - 0, /* tp_richcompare */ - 0, /* tp_weaklistoffset */ - 0, /* tp_iter */ - 0, /* tp_iternext */ - DCL_methods, /* tp_methods */ - 0, /* tp_members */ - 0, /* tp_getset */ - 0, /* tp_base */ - 0, /* tp_dict */ - 0, /* tp_descr_get */ - 0, /* tp_descr_set */ - 0, /* tp_dictoffset */ - (initproc)DCL_init, /* tp_init */ - 0, /* tp_alloc */ - 0, /* tp_new */ -}; - - -// Makes a new copy of the DeltaChunkList - you have to do everything yourselve -// in C ... want C++ !! -DeltaChunkList* DCL_new_instance(void) -{ - DeltaChunkList* dcl = (DeltaChunkList*) PyType_GenericNew(&DeltaChunkListType, 0, 0); - assert(dcl); - - DCL_init(dcl, 0, 0); - return dcl; -} - -// Read the next delta chunk from the given stream and advance it -// dc will contain the parsed information, its offset must be set by -// the previous call of next_delta_info, which implies it should remain the -// same instance between the calls. -// Return the altered uchar pointer, reassign it to the input data -inline -const uchar* next_delta_info(const uchar* data, DeltaChunk* dc) -{ - const char cmd = *data++; - - if (cmd & 0x80) - { - uint cp_off = 0, cp_size = 0; - if (cmd & 0x01) cp_off = *data++; - if (cmd & 0x02) cp_off |= (*data++ << 8); - if (cmd & 0x04) cp_off |= (*data++ << 16); - if (cmd & 0x08) cp_off |= ((unsigned) *data++ << 24); - if (cmd & 0x10) cp_size = *data++; - if (cmd & 0x20) cp_size |= (*data++ << 8); - if (cmd & 0x40) cp_size |= (*data++ << 16); // this should never get hit with current deltas ... - if (cp_size == 0) cp_size = 0x10000; - - dc->to += dc->ts; - dc->data = NULL; - dc->so = cp_off; - dc->ts = cp_size; - - } else if (cmd) { - // Just share the data - dc->to += dc->ts; - dc->data = data; - dc->ts = cmd; - dc->so = 0; - - data += cmd; - } else { - PyErr_SetString(PyExc_RuntimeError, "Encountered an unsupported delta cmd: 0"); - assert(0); - return NULL; - } - - return data; -} - -// Return amount of chunks encoded in the given delta stream -// If read_header is True, then the header msb chunks will be read first. -// Otherwise, the stream is assumed to be scrubbed one past the header -uint compute_chunk_count(const uchar* data, const uchar* dend, bool read_header) -{ - // read header - if (read_header){ - msb_size(&data, dend); - msb_size(&data, dend); - } - - DeltaChunk dc; - DC_init(&dc, 0, 0, 0, NULL); - uint num_chunks = 0; - - while (data < dend) - { - data = next_delta_info(data, &dc); - num_chunks += 1; - }// END handle command opcodes - - return num_chunks; -} - -PyObject* connect_deltas(PyObject *self, PyObject *dstreams) -{ - // obtain iterator - PyObject* stream_iter = 0; - if (!PyIter_Check(dstreams)){ - stream_iter = PyObject_GetIter(dstreams); - if (!stream_iter){ - PyErr_SetString(PyExc_RuntimeError, "Couldn't obtain iterator for streams"); - return NULL; - } - } else { - stream_iter = dstreams; - } - - DeltaInfoVector div; - ToplevelStreamInfo tdsinfo; - TSI_init(&tdsinfo); - DIV_init(&div, 0); - - - // GET TOPLEVEL DELTA STREAM - int error = 0; - PyObject* ds = 0; - unsigned int dsi = 0; // delta stream index we process - ds = PyIter_Next(stream_iter); - if (!ds){ - error = 1; - goto _error; - } - - dsi += 1; - tdsinfo.parent_object = PyObject_CallMethod(ds, "read", 0); - if (!PyObject_CheckReadBuffer(tdsinfo.parent_object)){ - Py_DECREF(ds); - error = 1; - goto _error; - } - - PyObject_AsReadBuffer(tdsinfo.parent_object, (const void**)&tdsinfo.tds, &tdsinfo.tdslen); - if (tdsinfo.tdslen > pow(2, 32)){ - // parent object is deallocated by info structure - Py_DECREF(ds); - PyErr_SetString(PyExc_RuntimeError, "Cannot handle deltas larger than 4GB"); - tdsinfo.parent_object = 0; - - error = 1; - goto _error; - } - Py_DECREF(ds); - - // let it officially know, and initialize its internal state - TSI_set_stream(&tdsinfo, tdsinfo.tds); - - // INTEGRATE ANCESTOR DELTA STREAMS - for (ds = PyIter_Next(stream_iter); ds != NULL; ds = PyIter_Next(stream_iter), ++dsi) - { - // Its important to initialize this before the next block which can jump - // to code who needs this to exist ! - PyObject* db = 0; - - // When processing the first delta, we know we will have to alter the tds - // Hence we copy it and deallocate the parent object - if (dsi == 1) { - if (!TSI_copy_stream_from_object(&tdsinfo)){ - PyErr_SetString(PyExc_RuntimeError, "Could not allocate memory to copy toplevel buffer"); - // info structure takes care of the parent_object - error = 1; - goto loop_end; - } - - tdsinfo.num_chunks = compute_chunk_count(tdsinfo.cstart, TSI_end(&tdsinfo), 0); - } - - db = PyObject_CallMethod(ds, "read", 0); - if (!PyObject_CheckReadBuffer(db)){ - error = 1; - PyErr_SetString(PyExc_RuntimeError, "Returned buffer didn't support the buffer protocol"); - goto loop_end; - } - - // Fill the stream info structure - const uchar* data; - Py_ssize_t dlen; - PyObject_AsReadBuffer(db, (const void**)&data, &dlen); - const uchar* const dstart = data; - const uchar* const dend = data + dlen; - div.dstream = dstart; - - if (dlen > pow(2, 32)){ - error = 1; - PyErr_SetString(PyExc_RuntimeError, "Cannot currently handle deltas larger than 4GB"); - goto loop_end; - } - - // READ HEADER - msb_size(&data, dend); - const ull target_size = msb_size(&data, dend); - - DIV_reserve_memory(&div, compute_chunk_count(data, dend, 0)); - - // parse command stream - DeltaInfo* di = 0; // temporary pointer - DeltaChunk dc; - DC_init(&dc, 0, 0, 0, NULL); - - assert(data < dend); - while (data < dend) - { - di = DIV_append(&div); - di->dso = data - dstart; - if ((data = next_delta_info(data, &dc))){ - di->to = dc.to; - } else { - error = 1; - goto loop_end; - } - }// END handle command opcodes - - // finalize information - div.di_last_size = dc.ts; - - if (DC_rbound(&dc) != target_size){ - PyErr_SetString(PyExc_RuntimeError, "Failed to parse delta stream"); - error = 1; - } - - #ifdef DEBUG - fprintf(stderr, "------------ Stream %i --------\n ", (int)dsi); - fprintf(stderr, "Before Connect: tdsinfo: num_chunks = %i, bytelen = %i KiB, target_size = %i KiB\n", (int)tdsinfo.num_chunks, (int)tdsinfo.tdslen/1000, (int)tdsinfo.target_size/1000); - fprintf(stderr, "div->num_chunks = %i, div->reserved_size = %i, div->bytelen=%i KiB\n", (int)div.size, (int)div.reserved_size, (int)dlen/1000); - #endif - - if (!DIV_connect_with_base(&tdsinfo, &div)){ - error = 1; - } - - #ifdef DEBUG - fprintf(stderr, "after connect: tdsinfo->num_chunks = %i, tdsinfo->bytelen = %i KiB\n", (int)tdsinfo.num_chunks, (int)tdsinfo.tdslen/1000); - #endif - - // destroy members, but keep memory - DIV_reset(&div); - -loop_end: - // perform cleanup - Py_DECREF(ds); - Py_DECREF(db); - - if (error){ - break; - } - }// END for each stream object - - if (dsi == 0){ - PyErr_SetString(PyExc_ValueError, "No streams provided"); - } - - -_error: - - if (stream_iter != dstreams){ - Py_DECREF(stream_iter); - } - - - DIV_destroy(&div); - - // Return the actual python object - its just a container - DeltaChunkList* dcl = DCL_new_instance(); - if (!dcl){ - PyErr_SetString(PyExc_RuntimeError, "Couldn't allocate list"); - // Otherwise tdsinfo would be deallocated by the chunk list - TSI_destroy(&tdsinfo); - error = 1; - } else { - // Plain copy, transfer ownership to dcl - dcl->istream = tdsinfo; - } - - if (error){ - // Will dealloc tdcv - Py_XDECREF(dcl); - return NULL; - } - - return (PyObject*)dcl; -} - - -// Write using a write function, taking remaining bytes from a base buffer -// replaces the corresponding method in python -PyObject* apply_delta(PyObject* self, PyObject* args) -{ - PyObject* pybbuf = 0; - PyObject* pydbuf = 0; - PyObject* pytbuf = 0; - if (!PyArg_ParseTuple(args, "OOO", &pybbuf, &pydbuf, &pytbuf)){ - PyErr_BadArgument(); - return NULL; - } - - PyObject* objects[] = { pybbuf, pydbuf, pytbuf }; - assert(sizeof(objects) / sizeof(PyObject*) == 3); - - uint i; - for(i = 0; i < 3; i++){ - if (!PyObject_CheckReadBuffer(objects[i])){ - PyErr_SetString(PyExc_ValueError, "Argument must be a buffer-compatible object, like a string, or a memory map"); - return NULL; - } - } - - Py_ssize_t lbbuf; Py_ssize_t ldbuf; Py_ssize_t ltbuf; - const uchar* bbuf; const uchar* dbuf; - uchar* tbuf; - PyObject_AsReadBuffer(pybbuf, (const void**)(&bbuf), &lbbuf); - PyObject_AsReadBuffer(pydbuf, (const void**)(&dbuf), &ldbuf); - - if (PyObject_AsWriteBuffer(pytbuf, (void**)(&tbuf), <buf)){ - PyErr_SetString(PyExc_ValueError, "Argument 3 must be a writable buffer"); - return NULL; - } - - const uchar* data = dbuf; - const uchar* dend = dbuf + ldbuf; - - while (data < dend) - { - const char cmd = *data++; - - if (cmd & 0x80) - { - unsigned long cp_off = 0, cp_size = 0; - if (cmd & 0x01) cp_off = *data++; - if (cmd & 0x02) cp_off |= (*data++ << 8); - if (cmd & 0x04) cp_off |= (*data++ << 16); - if (cmd & 0x08) cp_off |= ((unsigned) *data++ << 24); - if (cmd & 0x10) cp_size = *data++; - if (cmd & 0x20) cp_size |= (*data++ << 8); - if (cmd & 0x40) cp_size |= (*data++ << 16); - if (cp_size == 0) cp_size = 0x10000; - - memcpy(tbuf, bbuf + cp_off, cp_size); - tbuf += cp_size; - - } else if (cmd) { - memcpy(tbuf, data, cmd); - tbuf += cmd; - data += cmd; - } else { - PyErr_SetString(PyExc_RuntimeError, "Encountered an unsupported delta cmd: 0"); - return NULL; - } - }// END handle command opcodes - - Py_RETURN_NONE; -} diff --git a/gitdb/_delta_apply.h b/gitdb/_delta_apply.h deleted file mode 100644 index 1fcd538..0000000 --- a/gitdb/_delta_apply.h +++ /dev/null @@ -1,6 +0,0 @@ -#include - -extern PyObject* connect_deltas(PyObject *self, PyObject *dstreams); -extern PyObject* apply_delta(PyObject* self, PyObject* args); - -extern PyTypeObject DeltaChunkListType; diff --git a/gitdb/_fun.c b/gitdb/_fun.c deleted file mode 100644 index 4997038..0000000 --- a/gitdb/_fun.c +++ /dev/null @@ -1,107 +0,0 @@ -#include -#include "_delta_apply.h" - -static PyObject *PackIndexFile_sha_to_index(PyObject *self, PyObject *args) -{ - const unsigned char *sha; - const unsigned int sha_len; - - // Note: self is only set if we are a c type. We emulate an instance method, - // hence we have to get the instance as 'first' argument - - // get instance and sha - PyObject* inst = 0; - if (!PyArg_ParseTuple(args, "Os#", &inst, &sha, &sha_len)) - return NULL; - - if (sha_len != 20) { - PyErr_SetString(PyExc_ValueError, "Sha is not 20 bytes long"); - return NULL; - } - - if( !inst){ - PyErr_SetString(PyExc_ValueError, "Cannot be called without self"); - return NULL; - } - - // read lo and hi bounds - PyObject* fanout_table = PyObject_GetAttrString(inst, "_fanout_table"); - if (!fanout_table){ - PyErr_SetString(PyExc_ValueError, "Couldn't obtain fanout table"); - return NULL; - } - - unsigned int lo = 0, hi = 0; - if (sha[0]){ - PyObject* item = PySequence_GetItem(fanout_table, (const Py_ssize_t)(sha[0]-1)); - lo = PyInt_AS_LONG(item); - Py_DECREF(item); - } - PyObject* item = PySequence_GetItem(fanout_table, (const Py_ssize_t)sha[0]); - hi = PyInt_AS_LONG(item); - Py_DECREF(item); - item = 0; - - Py_DECREF(fanout_table); - - // get sha query function - PyObject* get_sha = PyObject_GetAttrString(inst, "sha"); - if (!get_sha){ - PyErr_SetString(PyExc_ValueError, "Couldn't obtain sha method"); - return NULL; - } - - PyObject *sha_str = 0; - while (lo < hi) { - const int mid = (lo + hi)/2; - sha_str = PyObject_CallFunction(get_sha, "i", mid); - if (!sha_str) { - return NULL; - } - - // we really trust that string ... for speed - const int cmp = memcmp(PyString_AS_STRING(sha_str), sha, 20); - Py_DECREF(sha_str); - sha_str = 0; - - if (cmp < 0){ - lo = mid + 1; - } - else if (cmp > 0) { - hi = mid; - } - else { - Py_DECREF(get_sha); - return PyInt_FromLong(mid); - }// END handle comparison - }// END while lo < hi - - // nothing found, cleanup - Py_DECREF(get_sha); - Py_RETURN_NONE; -} - -static PyMethodDef py_fun[] = { - { "PackIndexFile_sha_to_index", (PyCFunction)PackIndexFile_sha_to_index, METH_VARARGS, "TODO" }, - { "connect_deltas", (PyCFunction)connect_deltas, METH_O, "See python implementation" }, - { "apply_delta", (PyCFunction)apply_delta, METH_VARARGS, "See python implementation" }, - { NULL, NULL, 0, NULL } -}; - -#ifndef PyMODINIT_FUNC /* declarations for DLL import/export */ -#define PyMODINIT_FUNC void -#endif -PyMODINIT_FUNC init_perf(void) -{ - PyObject *m; - - if (PyType_Ready(&DeltaChunkListType) < 0) - return; - - m = Py_InitModule3("_perf", py_fun, NULL); - if (m == NULL) - return; - - Py_INCREF(&DeltaChunkListType); - PyModule_AddObject(m, "DeltaChunkList", (PyObject *)&DeltaChunkListType); -} diff --git a/gitdb/fun.py b/gitdb/fun.py index ac9d993..8ca38c8 100644 --- a/gitdb/fun.py +++ b/gitdb/fun.py @@ -776,6 +776,6 @@ def is_equal_canonical_sha(canonical_length, match, sha1): try: - from _perf import connect_deltas + from gitdb_speedups._perf import connect_deltas except ImportError: pass diff --git a/gitdb/pack.py b/gitdb/pack.py index 2447455..20a4515 100644 --- a/gitdb/pack.py +++ b/gitdb/pack.py @@ -35,7 +35,7 @@ ) try: - from _perf import PackIndexFile_sha_to_index + from gitdb_speedups._perf import PackIndexFile_sha_to_index except ImportError: pass # END try c module diff --git a/gitdb/stream.py b/gitdb/stream.py index be95c11..2f4c12d 100644 --- a/gitdb/stream.py +++ b/gitdb/stream.py @@ -33,7 +33,7 @@ has_perf_mod = False PY26 = sys.version_info[:2] < (2, 7) try: - from _perf import apply_delta as c_apply_delta + from gitdb_speedups._perf import apply_delta as c_apply_delta has_perf_mod = True except ImportError: pass diff --git a/setup.cfg b/setup.cfg new file mode 100644 index 0000000..3c6e79c --- /dev/null +++ b/setup.cfg @@ -0,0 +1,2 @@ +[bdist_wheel] +universal=1 diff --git a/setup.py b/setup.py index c2f9f1e..eccb952 100755 --- a/setup.py +++ b/setup.py @@ -1,125 +1,45 @@ -#!/usr/bin/env python -from distutils.core import setup, Extension -from distutils.command.build_py import build_py -from distutils.command.build_ext import build_ext +from setuptools import setup -import os -import sys +# NOTE: This is currently duplicated from the gitdb.__init__ module, because +# that's just how you write a setup.py (nobody reads this stuff out of the +# module) -# wow, this is a mixed bag ... I am pretty upset about all of this ... -setuptools_build_py_module = None -try: - # don't pull it in if we don't have to - if 'setuptools' in sys.modules: - import setuptools.command.build_py as setuptools_build_py_module - from setuptools.command.build_ext import build_ext -except ImportError: - pass - - -class build_ext_nofail(build_ext): - - """Doesn't fail when build our optional extensions""" - - def run(self): - try: - build_ext.run(self) - except Exception: - print("Ignored failure when building extensions, pure python modules will be used instead") - # END ignore errors - - -def get_data_files(self): - """Can you feel the pain ? So, in python2.5 and python2.4 coming with maya, - the line dealing with the ``plen`` has a bug which causes it to truncate too much. - It is fixed in the system interpreters as they receive patches, and shows how - bad it is if something doesn't have proper unittests. - The code here is a plain copy of the python2.6 version which works for all. - - Generate list of '(package,src_dir,build_dir,filenames)' tuples""" - data = [] - if not self.packages: - return data - - # this one is just for the setup tools ! They don't iniitlialize this variable - # when they should, but do it on demand using this method.Its crazy - if hasattr(self, 'analyze_manifest'): - self.analyze_manifest() - # END handle setuptools ... - - for package in self.packages: - # Locate package source directory - src_dir = self.get_package_dir(package) - - # Compute package build directory - build_dir = os.path.join(*([self.build_lib] + package.split('.'))) - - # Length of path to strip from found files - plen = 0 - if src_dir: - plen = len(src_dir) + 1 - - # Strip directory from globbed filenames - filenames = [ - file[plen:] for file in self.find_data_files(package, src_dir) - ] - data.append((package, src_dir, build_dir, filenames)) - return data - -build_py.get_data_files = get_data_files -if setuptools_build_py_module: - setuptools_build_py_module.build_py._get_data_files = get_data_files -# END apply setuptools patch too - -# NOTE: This is currently duplicated from the gitdb.__init__ module, as we cannot -# satisfy the dependencies at installation time, unfortunately, due to inherent limitations -# of distutils, which cannot install the prerequesites of a package before the acutal package. __author__ = "Sebastian Thiel" __contact__ = "byronimo@gmail.com" __homepage__ = "https://github.com/gitpython-developers/gitdb" version_info = (0, 6, 4) __version__ = '.'.join(str(i) for i in version_info) -setup(cmdclass={'build_ext': build_ext_nofail}, - name="gitdb", - version=__version__, - description="Git Object Database", - author=__author__, - author_email=__contact__, - url=__homepage__, - packages=('gitdb', 'gitdb.db', 'gitdb.utils', 'gitdb.test'), - package_dir = {'gitdb': 'gitdb'}, - ext_modules=[Extension('gitdb._perf', ['gitdb/_fun.c', 'gitdb/_delta_apply.c'], include_dirs=['gitdb'])], - license = "BSD License", - zip_safe=False, - requires=('smmap (>=0.8.5)', ), - install_requires=('smmap >= 0.8.5'), - long_description = """GitDB is a pure-Python git object database""", - # See https://pypi.python.org/pypi?%3Aaction=list_classifiers - classifiers=[ - # Picked from - # http://pypi.python.org/pypi?:action=list_classifiers - #"Development Status :: 1 - Planning", - #"Development Status :: 2 - Pre-Alpha", - #"Development Status :: 3 - Alpha", - # "Development Status :: 4 - Beta", - "Development Status :: 5 - Production/Stable", - #"Development Status :: 6 - Mature", - #"Development Status :: 7 - Inactive", - "Environment :: Console", - "Intended Audience :: Developers", - "License :: OSI Approved :: BSD License", - "Operating System :: OS Independent", - "Operating System :: POSIX", - "Operating System :: Microsoft :: Windows", - "Operating System :: MacOS :: MacOS X", - "Programming Language :: Python", - "Programming Language :: Python :: 2", - "Programming Language :: Python :: 2.6", - "Programming Language :: Python :: 2.7", - "Programming Language :: Python :: 3", - "Programming Language :: Python :: 3.2", - "Programming Language :: Python :: 3.3", - "Programming Language :: Python :: 3.4", - "Programming Language :: Python :: 3.5", -],) +setup( + name="gitdb", + version=__version__, + description="Git Object Database", + author=__author__, + author_email=__contact__, + url=__homepage__, + packages=('gitdb', 'gitdb.db', 'gitdb.utils', 'gitdb.test'), + license="BSD License", + zip_safe=False, + install_requires=['smmap >= 0.8.5'], + long_description="""GitDB is a pure-Python git object database""", + # See https://pypi.python.org/pypi?%3Aaction=list_classifiers + classifiers=[ + "Development Status :: 5 - Production/Stable", + "Environment :: Console", + "Intended Audience :: Developers", + "License :: OSI Approved :: BSD License", + "Operating System :: OS Independent", + "Operating System :: POSIX", + "Operating System :: Microsoft :: Windows", + "Operating System :: MacOS :: MacOS X", + "Programming Language :: Python", + "Programming Language :: Python :: 2", + "Programming Language :: Python :: 2.6", + "Programming Language :: Python :: 2.7", + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3.2", + "Programming Language :: Python :: 3.3", + "Programming Language :: Python :: 3.4", + "Programming Language :: Python :: 3.5", + ] +)