Skip to content

Commit 0183c4d

Browse files
Vicent Marti authored and Edward Thomson committed
path: Use UTF8 iteration for HFS chars
1 parent 65f4305 commit 0183c4d

File tree

3 files changed

+132
-81
lines changed

3 files changed

+132
-81
lines changed

src/path.c

Lines changed: 45 additions & 81 deletions
Original file line numberDiff line numberDiff line change
@@ -1282,93 +1282,57 @@ GIT_INLINE(bool) verify_dospath(
12821282
component[last] != ':');
12831283
}
12841284

1285-
GIT_INLINE(bool) verify_dotgit_hfs(const char *component, size_t len)
1285+
/*
 * Decode and return the next significant character of a path component,
 * advancing `*in` and shrinking `*len` past every byte consumed.
 *
 * The code points listed in the switch below are skipped entirely; the
 * first code point that is not skipped is returned, with ASCII 'A'-'Z'
 * folded to lowercase.
 *
 * @param in  in/out cursor into the component being scanned
 * @param len in/out number of bytes remaining at `*in`
 * @return the (ASCII-lowercased) code point; 0 once `*len` has reached
 *         zero (a decoded U+0000 also yields 0); -1 if git__utf8_iterate
 *         rejects the bytes at the cursor
 */
static int32_t next_hfs_char(const char **in, size_t *len)
{
	while (*len) {
		int32_t codepoint;
		/*
		 * The decoder never accepts a sequence longer than 4 bytes, so
		 * never report more than that. This keeps the size_t -> int
		 * conversion from truncating or turning negative on very large
		 * lengths (a negative length means "NUL-terminated" to
		 * git__utf8_iterate).
		 */
		int avail = (*len > 4) ? 4 : (int)(*len);
		int cp_len = git__utf8_iterate((const uint8_t *)(*in), avail, &codepoint);

		if (cp_len < 0)
			return -1;

		(*in) += cp_len;
		(*len) -= (size_t)cp_len;

		/* these code points are ignored completely */
		switch (codepoint) {
		case 0x200c: /* ZERO WIDTH NON-JOINER */
		case 0x200d: /* ZERO WIDTH JOINER */
		case 0x200e: /* LEFT-TO-RIGHT MARK */
		case 0x200f: /* RIGHT-TO-LEFT MARK */
		case 0x202a: /* LEFT-TO-RIGHT EMBEDDING */
		case 0x202b: /* RIGHT-TO-LEFT EMBEDDING */
		case 0x202c: /* POP DIRECTIONAL FORMATTING */
		case 0x202d: /* LEFT-TO-RIGHT OVERRIDE */
		case 0x202e: /* RIGHT-TO-LEFT OVERRIDE */
		case 0x206a: /* INHIBIT SYMMETRIC SWAPPING */
		case 0x206b: /* ACTIVATE SYMMETRIC SWAPPING */
		case 0x206c: /* INHIBIT ARABIC FORM SHAPING */
		case 0x206d: /* ACTIVATE ARABIC FORM SHAPING */
		case 0x206e: /* NATIONAL DIGIT SHAPES */
		case 0x206f: /* NOMINAL DIGIT SHAPES */
		case 0xfeff: /* ZERO WIDTH NO-BREAK SPACE */
			continue;
		default:
			break;
		}

		/*
		 * Fold into lowercase by hand, ASCII only. tolower() must not be
		 * used here: its argument has to be representable as an
		 * unsigned char (or be EOF), and a code point can be far larger.
		 * ASCII-only folding is sufficient because the git folder name
		 * can only be composed of ASCII characters.
		 */
		if (codepoint >= 'A' && codepoint <= 'Z')
			codepoint += 'a' - 'A';

		return codepoint;
	}

	return 0; /* no bytes left -- end of string */
}
1325+
1326+
/*
 * Check a path component against the name ".git" as seen through
 * next_hfs_char (ignored code points skipped, ASCII case folded).
 *
 * @param path component to inspect
 * @param len  length of the component in bytes
 * @return false only when next_hfs_char yields exactly '.', 'g', 'i', 't'
 *         and then 0 (end of input); true in every other case, including
 *         when next_hfs_char reports a decoding failure
 */
static bool verify_dotgit_hfs(const char *path, size_t len)
{
	/* Expected sequence of characters; the trailing 0 is the end marker. */
	static const int32_t expected[] = { '.', 'g', 'i', 't', 0 };
	size_t idx;

	for (idx = 0; idx < sizeof(expected) / sizeof(expected[0]); idx++) {
		/* Stop at the first mismatch, without reading any further. */
		if (next_hfs_char(&path, &len) != expected[idx])
			return true;
	}

	return false;
}
13731337

13741338
GIT_INLINE(bool) verify_char(unsigned char c, unsigned int flags)

src/util.c

Lines changed: 76 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -664,3 +664,79 @@ void git__insertsort_r(
664664
if (freeswap)
665665
git__free(swapel);
666666
}
667+
668+
/*
 * Sequence length implied by each possible leading byte:
 *   0x00-0x7F -> 1, 0xC0-0xDF -> 2, 0xE0-0xEF -> 3, 0xF0-0xF7 -> 4.
 * A 0 marks a byte that cannot start a sequence (0x80-0xBF and 0xF8-0xFF).
 */
static const int8_t utf8proc_utf8class[256] = {
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
	3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
	4, 4, 4, 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0
};

/*
 * Determine the byte length of the UTF-8 sequence starting at `str`.
 *
 * @param str     first byte of the sequence
 * @param str_len bytes available at `str`; a negative value disables the
 *                length check (the string is taken to be NUL-terminated)
 * @return the sequence length (1-4) on success; a negative value on
 *         failure: -1 for empty input or an invalid leading byte,
 *         -str_len when the sequence needs more bytes than are available,
 *         -i when byte `i` is not a continuation byte
 */
int git__utf8_charlen(const uint8_t *str, int str_len)
{
	int length, i;

	/*
	 * Empty input: fail before touching str[0], which would lie outside
	 * the buffer. (The truncation path below would otherwise produce
	 * -str_len == 0, which is not a negative error value.)
	 */
	if (str_len == 0)
		return -1;

	length = utf8proc_utf8class[str[0]];
	if (!length)
		return -1;

	if (str_len >= 0 && length > str_len)
		return -str_len;

	/* Every trailing byte must have the form 10xxxxxx. */
	for (i = 1; i < length; i++) {
		if ((str[i] & 0xC0) != 0x80)
			return -i;
	}

	return length;
}

/*
 * Decode the code point at the start of a UTF-8 string.
 *
 * Besides structurally broken sequences, the following decoded values are
 * rejected: overlong encodings, U+D800-U+DFFF, U+FDD0-U+FDEF, values of
 * 0x110000 and above, and any code point whose low 16 bits are 0xFFFE or
 * 0xFFFF.
 *
 * @param str     current position in the string
 * @param str_len bytes left in the string; negative if NUL-terminated
 * @param dst     receives the decoded code point, or -1 on failure
 * @return number of bytes consumed (1-4), or -1 if the input was rejected
 */
int git__utf8_iterate(const uint8_t *str, int str_len, int32_t *dst)
{
	int length;
	int32_t uc = -1;

	*dst = -1;
	length = git__utf8_charlen(str, str_len);
	if (length < 0)
		return -1;

	switch (length) {
	case 1:
		uc = str[0];
		break;
	case 2:
		uc = ((str[0] & 0x1F) << 6) + (str[1] & 0x3F);
		if (uc < 0x80)
			uc = -1; /* overlong */
		break;
	case 3:
		uc = ((str[0] & 0x0F) << 12) + ((str[1] & 0x3F) << 6)
			+ (str[2] & 0x3F);
		if (uc < 0x800 || (uc >= 0xD800 && uc < 0xE000) ||
			(uc >= 0xFDD0 && uc < 0xFDF0))
			uc = -1;
		break;
	case 4:
		uc = ((str[0] & 0x07) << 18) + ((str[1] & 0x3F) << 12)
			+ ((str[2] & 0x3F) << 6) + (str[3] & 0x3F);
		if (uc < 0x10000 || uc >= 0x110000)
			uc = -1;
		break;
	default:
		/* not reachable with a valid length; uc stays -1 */
		break;
	}

	if (uc < 0 || ((uc & 0xFFFF) >= 0xFFFE))
		return -1;

	*dst = uc;
	return length;
}

src/util.h

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -367,6 +367,17 @@ extern int git__date_rfc2822_fmt(char *out, size_t len, const git_time *date);
367367
*/
368368
extern size_t git__unescape(char *str);
369369

370+
/*
371+
* Iterate through a UTF-8 string, yielding one
372+
* codepoint at a time.
373+
*
374+
* @param str current position in the string
375+
* @param str_len size left in the string; -1 if the string is NUL-terminated
376+
* @param dst pointer where to store the current codepoint
377+
* @return length in bytes of the read codepoint; -1 if the codepoint was invalid
378+
*/
379+
extern int git__utf8_iterate(const uint8_t *str, int str_len, int32_t *dst);
380+
370381
/*
371382
* Safely zero-out memory, making sure that the compiler
372383
* doesn't optimize away the operation.

0 commit comments

Comments
 (0)