Skip to content

Commit e68e33f

Browse files
author
Vicent Martí
committed
Merge pull request libgit2#1233 from arrbee/file-similarity-metric
Add file similarity scoring to diff rename/copy detection
2 parents 9f9477d + 1be4ba9 commit e68e33f

34 files changed

+1409
-297
lines changed

include/git2/diff.h

Lines changed: 81 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -88,10 +88,9 @@ typedef enum {
8888
GIT_DIFF_INCLUDE_UNTRACKED = (1 << 8),
8989
/** Include unmodified files in the diff list */
9090
GIT_DIFF_INCLUDE_UNMODIFIED = (1 << 9),
91-
/** Even with the GIT_DIFF_INCLUDE_UNTRACKED flag, when an untracked
92-
* directory is found, only a single entry for the directory is added
93-
* to the diff list; with this flag, all files under the directory will
94-
* be included, too.
91+
/** Even with GIT_DIFF_INCLUDE_UNTRACKED, an entire untracked directory
92+
* will be marked with only a single entry in the diff list; this flag
93+
* adds all files under the directory as UNTRACKED entries, too.
9594
*/
9695
GIT_DIFF_RECURSE_UNTRACKED_DIRS = (1 << 10),
9796
/** If the pathspec is set in the diff options, this flag means to
@@ -120,6 +119,11 @@ typedef enum {
120119
GIT_DIFF_INCLUDE_TYPECHANGE_TREES = (1 << 16),
121120
/** Ignore file mode changes */
122121
GIT_DIFF_IGNORE_FILEMODE = (1 << 17),
122+
/** Even with GIT_DIFF_INCLUDE_IGNORED, an entire ignored directory
123+
* will be marked with only a single entry in the diff list; this flag
124+
* adds all files under the directory as IGNORED entries, too.
125+
*/
126+
GIT_DIFF_RECURSE_IGNORED_DIRS = (1 << 18),
123127
} git_diff_option_t;
124128

125129
/**
@@ -133,20 +137,18 @@ typedef enum {
133137
typedef struct git_diff_list git_diff_list;
134138

135139
/**
136-
* Flags for the file object on each side of a diff.
140+
* Flags for the delta object and the file objects on each side.
137141
*
138-
* Note: most of these flags are just for **internal** consumption by
139-
* libgit2, but some of them may be interesting to external users.
142+
* These flags are used for both the `flags` value of the `git_diff_delta`
143+
* and the flags for the `git_diff_file` objects representing the old and
144+
* new sides of the delta. Values outside of this public range should be
145+
* considered reserved for internal or future use.
140146
*/
141147
typedef enum {
142-
GIT_DIFF_FILE_VALID_OID = (1 << 0), /** `oid` value is known correct */
143-
GIT_DIFF_FILE_FREE_PATH = (1 << 1), /** `path` is allocated memory */
144-
GIT_DIFF_FILE_BINARY = (1 << 2), /** should be considered binary data */
145-
GIT_DIFF_FILE_NOT_BINARY = (1 << 3), /** should be considered text data */
146-
GIT_DIFF_FILE_FREE_DATA = (1 << 4), /** internal file data is allocated */
147-
GIT_DIFF_FILE_UNMAP_DATA = (1 << 5), /** internal file data is mmap'ed */
148-
GIT_DIFF_FILE_NO_DATA = (1 << 6), /** file data should not be loaded */
149-
} git_diff_file_flag_t;
148+
GIT_DIFF_FLAG_BINARY = (1 << 0), /** file(s) treated as binary data */
149+
GIT_DIFF_FLAG_NOT_BINARY = (1 << 1), /** file(s) treated as text data */
150+
GIT_DIFF_FLAG_VALID_OID = (1 << 2), /** `oid` value is known correct */
151+
} git_diff_flag_t;
150152

151153
/**
152154
* What type of change is described by a git_diff_delta?
@@ -186,18 +188,17 @@ typedef enum {
186188
*
187189
* `size` is the size of the entry in bytes.
188190
*
189-
* `flags` is a combination of the `git_diff_file_flag_t` types, but those
190-
* are largely internal values.
191+
* `flags` is a combination of the `git_diff_flag_t` types
191192
*
192193
* `mode` is, roughly, the stat() `st_mode` value for the item. This will
193194
* be restricted to one of the `git_filemode_t` values.
194195
*/
195196
typedef struct {
196-
git_oid oid;
197+
git_oid oid;
197198
const char *path;
198-
git_off_t size;
199-
unsigned int flags;
200-
uint16_t mode;
199+
git_off_t size;
200+
uint32_t flags;
201+
uint16_t mode;
201202
} git_diff_file;
202203

203204
/**
@@ -219,16 +220,17 @@ typedef struct {
219220
*
220221
* Under some circumstances, in the name of efficiency, not all fields will
221222
* be filled in, but we generally try to fill in as much as possible. One
222-
* example is that the "binary" field will not examine file contents if you
223-
* do not pass in hunk and/or line callbacks to the diff foreach iteration
224-
* function. It will just use the git attributes for those files.
223+
* example is that the "flags" field may not have either the `BINARY` or the
224+
* `NOT_BINARY` flag set to avoid examining file contents if you do not pass
225+
* in hunk and/or line callbacks to the diff foreach iteration function. It
226+
* will just use the git attributes for those files.
225227
*/
226228
typedef struct {
227229
git_diff_file old_file;
228230
git_diff_file new_file;
229231
git_delta_t status;
230-
unsigned int similarity; /**< for RENAMED and COPIED, value 0-100 */
231-
int binary;
232+
uint32_t similarity; /**< for RENAMED and COPIED, value 0-100 */
233+
uint32_t flags;
232234
} git_diff_delta;
233235

234236
/**
@@ -377,7 +379,7 @@ typedef struct git_diff_patch git_diff_patch;
377379
typedef enum {
378380
/** look for renames? (`--find-renames`) */
379381
GIT_DIFF_FIND_RENAMES = (1 << 0),
380-
/** consider old size of modified for renames? (`--break-rewrites=N`) */
382+
/** consider old side of modified for renames? (`--break-rewrites=N`) */
381383
GIT_DIFF_FIND_RENAMES_FROM_REWRITES = (1 << 1),
382384

383385
/** look for copies? (a la `--find-copies`) */
@@ -387,10 +389,49 @@ typedef enum {
387389

388390
/** split large rewrites into delete/add pairs (`--break-rewrites=/M`) */
389391
GIT_DIFF_FIND_AND_BREAK_REWRITES = (1 << 4),
392+
393+
/** turn on all finding features */
394+
GIT_DIFF_FIND_ALL = (0x1f),
395+
396+
/** measure similarity ignoring leading whitespace (default) */
397+
GIT_DIFF_FIND_IGNORE_LEADING_WHITESPACE = 0,
398+
/** measure similarity ignoring all whitespace */
399+
GIT_DIFF_FIND_IGNORE_WHITESPACE = (1 << 6),
400+
/** measure similarity including all data */
401+
GIT_DIFF_FIND_DONT_IGNORE_WHITESPACE = (1 << 7),
390402
} git_diff_find_t;
391403

404+
/**
405+
* Pluggable similarity metric
406+
*/
407+
typedef struct {
408+
int (*file_signature)(
409+
void **out, const git_diff_file *file,
410+
const char *fullpath, void *payload);
411+
int (*buffer_signature)(
412+
void **out, const git_diff_file *file,
413+
const char *buf, size_t buflen, void *payload);
414+
void (*free_signature)(void *sig, void *payload);
415+
int (*similarity)(int *score, void *siga, void *sigb, void *payload);
416+
void *payload;
417+
} git_diff_similarity_metric;
418+
392419
/**
393420
* Control behavior of rename and copy detection
421+
*
422+
* These options mostly mimic parameters that can be passed to git-diff.
423+
*
424+
* - `rename_threshold` is the same as the -M option with a value
425+
* - `copy_threshold` is the same as the -C option with a value
426+
* - `rename_from_rewrite_threshold` matches the top of the -B option
427+
* - `break_rewrite_threshold` matches the bottom of the -B option
428+
* - `target_limit` matches the -l option
429+
*
430+
* The `metric` option allows you to plug in a custom similarity metric.
431+
* Set it to NULL for the default internal metric which is based on sampling
432+
* hashes of ranges of data in the file. The default metric is a pretty
433+
* good similarity approximation that should work fairly well for both text
434+
* and binary data, and is pretty fast with fixed memory overhead.
394435
*/
395436
typedef struct {
396437
unsigned int version;
@@ -411,6 +452,9 @@ typedef struct {
411452
* the `diff.renameLimit` config) (default 200)
412453
*/
413454
unsigned int target_limit;
455+
456+
/** Pluggable similarity metric; pass NULL to use internal metric */
457+
git_diff_similarity_metric *metric;
414458
} git_diff_find_options;
415459

416460
#define GIT_DIFF_FIND_OPTIONS_VERSION 1
@@ -856,11 +900,12 @@ GIT_EXTERN(int) git_diff_patch_to_str(
856900
*
857901
* NULL is allowed for either `old_blob` or `new_blob` and will be treated
858902
* as an empty blob, with the `oid` set to NULL in the `git_diff_file` data.
903+
* Passing NULL for both blobs is a noop; no callbacks will be made at all.
859904
*
860-
* We do run a binary content check on the two blobs and if either of the
861-
* blobs looks like binary data, the `git_diff_delta` binary attribute will
862-
* be set to 1 and no call to the hunk_cb nor line_cb will be made (unless
863-
* you pass `GIT_DIFF_FORCE_TEXT` of course).
905+
* We do run a binary content check on the blob content and if either blob
906+
* looks like binary data, `GIT_DIFF_FLAG_BINARY` will be set in the
907+
* delta's `flags` and no hunk_cb or line_cb call will be made (unless you pass
908+
* `GIT_DIFF_FORCE_TEXT` of course).
864909
*
865910
* @return 0 on success, GIT_EUSER on non-zero callback, or error code
866911
*/
@@ -880,6 +925,11 @@ GIT_EXTERN(int) git_diff_blobs(
880925
* so the `git_diff_file` parameters to the callbacks will be faked a la the
881926
* rules for `git_diff_blobs()`.
882927
*
928+
* Passing NULL for `old_blob` will be treated as an empty blob (i.e. the
929+
* `file_cb` will be invoked with GIT_DELTA_ADDED and the diff will be the
930+
* entire content of the buffer added). Passing NULL for the buffer will do
931+
* the reverse, with GIT_DELTA_DELETED and blob content removed.
932+
*
883933
* @return 0 on success, GIT_EUSER on non-zero callback, or error code
884934
*/
885935
GIT_EXTERN(int) git_diff_blob_to_buffer(

src/checkout.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -78,7 +78,7 @@ static int checkout_notify(
7878
git_oid_cpy(&wdfile.oid, &wditem->oid);
7979
wdfile.path = wditem->path;
8080
wdfile.size = wditem->file_size;
81-
wdfile.flags = GIT_DIFF_FILE_VALID_OID;
81+
wdfile.flags = GIT_DIFF_FLAG_VALID_OID;
8282
wdfile.mode = wditem->mode;
8383

8484
workdir = &wdfile;

src/diff.c

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -92,11 +92,11 @@ static int diff_delta__from_one(
9292
git_oid_cpy(&delta->new_file.oid, &entry->oid);
9393
}
9494

95-
delta->old_file.flags |= GIT_DIFF_FILE_VALID_OID;
95+
delta->old_file.flags |= GIT_DIFF_FLAG_VALID_OID;
9696

9797
if (delta->status == GIT_DELTA_DELETED ||
9898
!git_oid_iszero(&delta->new_file.oid))
99-
delta->new_file.flags |= GIT_DIFF_FILE_VALID_OID;
99+
delta->new_file.flags |= GIT_DIFF_FLAG_VALID_OID;
100100

101101
notify_res = diff_notify(diff, delta, matched_pathspec);
102102

@@ -142,7 +142,7 @@ static int diff_delta__from_two(
142142
git_oid_cpy(&delta->old_file.oid, &old_entry->oid);
143143
delta->old_file.size = old_entry->file_size;
144144
delta->old_file.mode = old_mode;
145-
delta->old_file.flags |= GIT_DIFF_FILE_VALID_OID;
145+
delta->old_file.flags |= GIT_DIFF_FLAG_VALID_OID;
146146

147147
git_oid_cpy(&delta->new_file.oid, &new_entry->oid);
148148
delta->new_file.size = new_entry->file_size;
@@ -156,7 +156,7 @@ static int diff_delta__from_two(
156156
}
157157

158158
if (new_oid || !git_oid_iszero(&new_entry->oid))
159-
delta->new_file.flags |= GIT_DIFF_FILE_VALID_OID;
159+
delta->new_file.flags |= GIT_DIFF_FLAG_VALID_OID;
160160

161161
notify_res = diff_notify(diff, delta, matched_pathspec);
162162

src/diff.h

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -28,8 +28,14 @@ enum {
2828
GIT_DIFFCAPS_USE_DEV = (1 << 4), /* use st_dev? */
2929
};
3030

31-
#define GIT_DELTA__TO_DELETE 10
32-
#define GIT_DELTA__TO_SPLIT 11
31+
enum {
32+
GIT_DIFF_FLAG__FREE_PATH = (1 << 7), /* `path` is allocated memory */
33+
GIT_DIFF_FLAG__FREE_DATA = (1 << 8), /* internal file data is allocated */
34+
GIT_DIFF_FLAG__UNMAP_DATA = (1 << 9), /* internal file data is mmap'ed */
35+
GIT_DIFF_FLAG__NO_DATA = (1 << 10), /* file data should not be loaded */
36+
GIT_DIFF_FLAG__TO_DELETE = (1 << 11), /* delete entry during rename det. */
37+
GIT_DIFF_FLAG__TO_SPLIT = (1 << 12), /* split entry during rename det. */
38+
};
3339

3440
struct git_diff_list {
3541
git_refcount rc;

0 commit comments

Comments
 (0)