Skip to content

Commit 8fe8cfd

Browse files
authored
Use string length diff heuristic to skip Levenshtein Algo (RsyncProject#369)
When using the --fuzzy option to try to find close matches locally, the edit distance algorithm used is O(N^2), which can get painful on CPU-constrained systems when working in folders with tens of thousands of files in them. A lower bound on the Levenshtein distance is the difference in length between the two strings being compared, so if that difference is larger than the distance of the current best match, the calculation of the exact edit distance between the two strings can be skipped. Testing on the openSUSE package repo has shown a 50% reduction in the CPU time required to plan the rsync transaction.
1 parent 7a2dbf7 commit 8fe8cfd

File tree

2 files changed

+14
-4
lines changed

2 files changed

+14
-4
lines changed

generator.c

+6-3
Original file line numberDiff line numberDiff line change
@@ -875,9 +875,12 @@ static struct file_struct *find_fuzzy(struct file_struct *file, struct file_list
875875
len = strlen(name);
876876
suf = find_filename_suffix(name, len, &suf_len);
877877

878-
dist = fuzzy_distance(name, len, fname, fname_len);
879-
/* Add some extra weight to how well the suffixes match. */
880-
dist += fuzzy_distance(suf, suf_len, fname_suf, fname_suf_len) * 10;
878+
dist = fuzzy_distance(name, len, fname, fname_len, lowest_dist);
879+
/* Add some extra weight to how well the suffixes match unless we've already disqualified
880+
* this file based on a heuristic. */
881+
if (dist < 0xFFFF0000U) {
882+
dist += fuzzy_distance(suf, suf_len, fname_suf, fname_suf_len, 0xFFFF0000U) * 10;
883+
}
881884
if (DEBUG_GTE(FUZZY, 2)) {
882885
rprintf(FINFO, "fuzzy distance for %s = %d.%05d\n",
883886
f_name(fp, NULL), (int)(dist>>16), (int)(dist&0xFFFF));

util1.c

+8-1
Original file line numberDiff line numberDiff line change
@@ -1487,12 +1487,19 @@ const char *find_filename_suffix(const char *fn, int fn_len, int *len_ptr)
14871487

14881488
#define UNIT (1 << 16)
14891489

1490-
uint32 fuzzy_distance(const char *s1, unsigned len1, const char *s2, unsigned len2)
1490+
uint32 fuzzy_distance(const char *s1, unsigned len1, const char *s2, unsigned len2, uint32 upperlimit)
14911491
{
14921492
uint32 a[MAXPATHLEN], diag, above, left, diag_inc, above_inc, left_inc;
14931493
int32 cost;
14941494
unsigned i1, i2;
14951495

1496+
/* Check to see if the Levenshtein distance must be greater than the
1497+
* upper limit defined by the previously found lowest distance using
1498+
* the heuristic that the Levenshtein distance is at least the
1499+
* difference in length of the two strings */
1500+
if ((len1 > len2 ? len1 - len2 : len2 - len1) * UNIT > upperlimit)
1501+
return 0xFFFFU * UNIT + 1;
1502+
14961503
if (!len1 || !len2) {
14971504
if (!len1) {
14981505
s1 = s2;

0 commit comments

Comments
 (0)