Skip to content

Commit 906ce2a

Browse files
committed
InternPool: use sequential string indices instead of byte offsets
This allows more bytes to be referenced by a smaller index range. Closes ziglang#22867 Closes ziglang#25297 Closes ziglang#25339
1 parent 606c7bc commit 906ce2a

File tree

5 files changed

+102
-63
lines changed

5 files changed

+102
-63
lines changed

src/Compilation.zig

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3684,6 +3684,7 @@ const Header = extern struct {
36843684
items_len: u32,
36853685
extra_len: u32,
36863686
limbs_len: u32,
3687+
strings_len: u32,
36873688
string_bytes_len: u32,
36883689
tracked_insts_len: u32,
36893690
files_len: u32,
@@ -3732,7 +3733,8 @@ pub fn saveState(comp: *Compilation) !void {
37323733
.items_len = @intCast(local.mutate.items.len),
37333734
.extra_len = @intCast(local.mutate.extra.len),
37343735
.limbs_len = @intCast(local.mutate.limbs.len),
3735-
.string_bytes_len = @intCast(local.mutate.strings.len),
3736+
.strings_len = @intCast(local.mutate.strings.len),
3737+
.string_bytes_len = @intCast(local.mutate.string_bytes.len),
37363738
.tracked_insts_len = @intCast(local.mutate.tracked_insts.len),
37373739
.files_len = @intCast(local.mutate.files.len),
37383740
},
@@ -3775,8 +3777,11 @@ pub fn saveState(comp: *Compilation) !void {
37753777
addBuf(&bufs, @ptrCast(local.shared.items.view().items(.data)[0..pt_header.intern_pool.items_len]));
37763778
addBuf(&bufs, @ptrCast(local.shared.items.view().items(.tag)[0..pt_header.intern_pool.items_len]));
37773779
}
3780+
if (pt_header.intern_pool.strings_len > 0) {
3781+
addBuf(&bufs, @ptrCast(local.shared.strings.view().items(.@"0")[0..pt_header.intern_pool.strings_len]));
3782+
}
37783783
if (pt_header.intern_pool.string_bytes_len > 0) {
3779-
addBuf(&bufs, local.shared.strings.view().items(.@"0")[0..pt_header.intern_pool.string_bytes_len]);
3784+
addBuf(&bufs, local.shared.string_bytes.view().items(.@"0")[0..pt_header.intern_pool.string_bytes_len]);
37803785
}
37813786
if (pt_header.intern_pool.tracked_insts_len > 0) {
37823787
addBuf(&bufs, @ptrCast(local.shared.tracked_insts.view().items(.@"0")[0..pt_header.intern_pool.tracked_insts_len]));

src/InternPool.zig

Lines changed: 82 additions & 48 deletions
Original file line numberDiff line numberDiff line change
@@ -196,7 +196,7 @@ pub const TrackedInst = extern struct {
196196
pub fn wrap(unwrapped: Unwrapped, ip: *const InternPool) TrackedInst.Index {
197197
assert(@intFromEnum(unwrapped.tid) <= ip.getTidMask());
198198
assert(unwrapped.index <= ip.getIndexMask(u32));
199-
return @enumFromInt(@as(u32, @intFromEnum(unwrapped.tid)) << ip.tid_shift_32 |
199+
return @enumFromInt(@shlExact(@as(u32, @intFromEnum(unwrapped.tid)), ip.tid_shift_32) |
200200
unwrapped.index);
201201
}
202202
};
@@ -494,7 +494,7 @@ pub const ComptimeUnit = extern struct {
494494
fn wrap(unwrapped: Unwrapped, ip: *const InternPool) ComptimeUnit.Id {
495495
assert(@intFromEnum(unwrapped.tid) <= ip.getTidMask());
496496
assert(unwrapped.index <= ip.getIndexMask(u32));
497-
return @enumFromInt(@as(u32, @intFromEnum(unwrapped.tid)) << ip.tid_shift_32 |
497+
return @enumFromInt(@shlExact(@as(u32, @intFromEnum(unwrapped.tid)), ip.tid_shift_32) |
498498
unwrapped.index);
499499
}
500500
};
@@ -713,7 +713,7 @@ pub const Nav = struct {
713713
fn wrap(unwrapped: Unwrapped, ip: *const InternPool) Nav.Index {
714714
assert(@intFromEnum(unwrapped.tid) <= ip.getTidMask());
715715
assert(unwrapped.index <= ip.getIndexMask(u32));
716-
return @enumFromInt(@as(u32, @intFromEnum(unwrapped.tid)) << ip.tid_shift_32 |
716+
return @enumFromInt(@shlExact(@as(u32, @intFromEnum(unwrapped.tid)), ip.tid_shift_32) |
717717
unwrapped.index);
718718
}
719719
};
@@ -1061,6 +1061,7 @@ const Local = struct {
10611061
extra: ListMutate,
10621062
limbs: ListMutate,
10631063
strings: ListMutate,
1064+
string_bytes: ListMutate,
10641065
tracked_insts: ListMutate,
10651066
files: ListMutate,
10661067
maps: ListMutate,
@@ -1075,6 +1076,7 @@ const Local = struct {
10751076
extra: Extra,
10761077
limbs: Limbs,
10771078
strings: Strings,
1079+
string_bytes: StringBytes,
10781080
tracked_insts: TrackedInsts,
10791081
files: List(File),
10801082
maps: Maps,
@@ -1098,7 +1100,8 @@ const Local = struct {
10981100
@sizeOf(u64) => List(struct { u64 }),
10991101
else => @compileError("unsupported host"),
11001102
};
1101-
const Strings = List(struct { u8 });
1103+
const Strings = List(struct { u32 });
1104+
const StringBytes = List(struct { u8 });
11021105
const TrackedInsts = List(struct { TrackedInst.MaybeLost });
11031106
const Maps = List(struct { FieldMap });
11041107
const Navs = List(Nav.Repr);
@@ -1428,17 +1431,27 @@ const Local = struct {
14281431
};
14291432
}
14301433

1434+
/// A list of offsets into `string_bytes`: entry `i` is where string `i` starts, and entry `i + 1` is one past its null terminator.
1435+
pub fn getMutableStrings(local: *Local, gpa: Allocator) Strings.Mutable {
1436+
return .{
1437+
.gpa = gpa,
1438+
.arena = &local.mutate.arena,
1439+
.mutate = &local.mutate.strings,
1440+
.list = &local.shared.strings,
1441+
};
1442+
}
1443+
14311444
/// In order to store references to strings in fewer bytes, we copy all
14321445
/// string bytes into here. String bytes can be null. It is up to whoever
14331446
/// is referencing the data here whether they want to store both index and length,
14341447
/// thus allowing null bytes, or store only index, and use null-termination. The
1435-
/// `strings` array is agnostic to either usage.
1436-
pub fn getMutableStrings(local: *Local, gpa: Allocator) Strings.Mutable {
1448+
/// `string_bytes` array is agnostic to either usage.
1449+
pub fn getMutableStringBytes(local: *Local, gpa: Allocator) StringBytes.Mutable {
14371450
return .{
14381451
.gpa = gpa,
14391452
.arena = &local.mutate.arena,
1440-
.mutate = &local.mutate.strings,
1441-
.list = &local.shared.strings,
1453+
.mutate = &local.mutate.string_bytes,
1454+
.list = &local.shared.string_bytes,
14421455
};
14431456
}
14441457

@@ -1611,7 +1624,7 @@ const Shard = struct {
16111624
};
16121625

16131626
fn getTidMask(ip: *const InternPool) u32 {
1614-
return (@as(u32, 1) << ip.tid_width) - 1;
1627+
return @shlExact(@as(u32, 1), ip.tid_width) - 1;
16151628
}
16161629

16171630
fn getIndexMask(ip: *const InternPool, comptime BackingInt: type) u32 {
@@ -1652,7 +1665,7 @@ pub const MapIndex = enum(u32) {
16521665
fn wrap(unwrapped: Unwrapped, ip: *const InternPool) MapIndex {
16531666
assert(@intFromEnum(unwrapped.tid) <= ip.getTidMask());
16541667
assert(unwrapped.index <= ip.getIndexMask(u32));
1655-
return @enumFromInt(@as(u32, @intFromEnum(unwrapped.tid)) << ip.tid_shift_32 |
1668+
return @enumFromInt(@shlExact(@as(u32, @intFromEnum(unwrapped.tid)), ip.tid_shift_32) |
16561669
unwrapped.index);
16571670
}
16581671
};
@@ -1678,7 +1691,7 @@ pub const NamespaceIndex = enum(u32) {
16781691
assert(@intFromEnum(unwrapped.tid) <= ip.getTidMask());
16791692
assert(unwrapped.bucket_index <= ip.getIndexMask(u32) >> Local.namespaces_bucket_width);
16801693
assert(unwrapped.index <= Local.namespaces_bucket_mask);
1681-
return @enumFromInt(@as(u32, @intFromEnum(unwrapped.tid)) << ip.tid_shift_32 |
1694+
return @enumFromInt(@shlExact(@as(u32, @intFromEnum(unwrapped.tid)), ip.tid_shift_32) |
16821695
unwrapped.bucket_index << Local.namespaces_bucket_width |
16831696
unwrapped.index);
16841697
}
@@ -1721,7 +1734,7 @@ pub const FileIndex = enum(u32) {
17211734
fn wrap(unwrapped: Unwrapped, ip: *const InternPool) FileIndex {
17221735
assert(@intFromEnum(unwrapped.tid) <= ip.getTidMask());
17231736
assert(unwrapped.index <= ip.getIndexMask(u32));
1724-
return @enumFromInt(@as(u32, @intFromEnum(unwrapped.tid)) << ip.tid_shift_32 |
1737+
return @enumFromInt(@shlExact(@as(u32, @intFromEnum(unwrapped.tid)), ip.tid_shift_32) |
17251738
unwrapped.index);
17261739
}
17271740
};
@@ -1780,7 +1793,8 @@ pub const String = enum(u32) {
17801793
fn wrap(unwrapped: Unwrapped, ip: *const InternPool) String {
17811794
assert(@intFromEnum(unwrapped.tid) <= ip.getTidMask());
17821795
assert(unwrapped.index <= ip.getIndexMask(u32));
1783-
return @enumFromInt(@as(u32, @intFromEnum(unwrapped.tid)) << ip.tid_shift_32 | unwrapped.index);
1796+
return @enumFromInt(@shlExact(@as(u32, @intFromEnum(unwrapped.tid)), ip.tid_shift_32) |
1797+
unwrapped.index);
17841798
}
17851799
};
17861800
fn unwrap(string: String, ip: *const InternPool) Unwrapped {
@@ -1791,9 +1805,11 @@ pub const String = enum(u32) {
17911805
}
17921806

17931807
fn toOverlongSlice(string: String, ip: *const InternPool) []const u8 {
1794-
const unwrapped_string = string.unwrap(ip);
1795-
const strings = ip.getLocalShared(unwrapped_string.tid).strings.acquire();
1796-
return strings.view().items(.@"0")[unwrapped_string.index..];
1808+
const unwrapped = string.unwrap(ip);
1809+
const local_shared = ip.getLocalShared(unwrapped.tid);
1810+
const strings = local_shared.strings.acquire().view().items(.@"0");
1811+
const string_bytes = local_shared.string_bytes.acquire().view().items(.@"0");
1812+
return string_bytes[strings[unwrapped.index]..];
17971813
}
17981814

17991815
const debug_state = InternPool.debug_state;
@@ -1848,12 +1864,18 @@ pub const NullTerminatedString = enum(u32) {
18481864
}
18491865

18501866
pub fn toSlice(string: NullTerminatedString, ip: *const InternPool) [:0]const u8 {
1851-
const overlong_slice = string.toString().toOverlongSlice(ip);
1852-
return overlong_slice[0..std.mem.indexOfScalar(u8, overlong_slice, 0).? :0];
1867+
const unwrapped = string.toString().unwrap(ip);
1868+
const local_shared = ip.getLocalShared(unwrapped.tid);
1869+
const strings = local_shared.strings.acquire().view().items(.@"0");
1870+
const string_bytes = local_shared.string_bytes.acquire().view().items(.@"0");
1871+
return string_bytes[strings[unwrapped.index] .. strings[unwrapped.index + 1] - 1 :0];
18531872
}
18541873

18551874
pub fn length(string: NullTerminatedString, ip: *const InternPool) u32 {
1856-
return @intCast(string.toSlice(ip).len);
1875+
const unwrapped = string.toString().unwrap(ip);
1876+
const local_shared = ip.getLocalShared(unwrapped.tid);
1877+
const strings = local_shared.strings.acquire().view().items(.@"0");
1878+
return strings[unwrapped.index + 1] - 1 - strings[unwrapped.index];
18571879
}
18581880

18591881
pub fn eqlSlice(string: NullTerminatedString, slice: []const u8, ip: *const InternPool) bool {
@@ -4767,7 +4789,8 @@ pub const Index = enum(u32) {
47674789
fn wrap(unwrapped: Unwrapped, ip: *const InternPool) Index {
47684790
assert(@intFromEnum(unwrapped.tid) <= ip.getTidMask());
47694791
assert(unwrapped.index <= ip.getIndexMask(u30));
4770-
return @enumFromInt(@as(u32, @intFromEnum(unwrapped.tid)) << ip.tid_shift_30 | unwrapped.index);
4792+
return @enumFromInt(@shlExact(@as(u32, @intFromEnum(unwrapped.tid)), ip.tid_shift_30) |
4793+
unwrapped.index);
47714794
}
47724795

47734796
pub fn getExtra(unwrapped: Unwrapped, ip: *const InternPool) Local.Extra {
@@ -6795,6 +6818,7 @@ pub fn init(ip: *InternPool, gpa: Allocator, available_threads: usize) !void {
67956818
.extra = .empty,
67966819
.limbs = .empty,
67976820
.strings = .empty,
6821+
.string_bytes = .empty,
67986822
.tracked_insts = .empty,
67996823
.files = .empty,
68006824
.maps = .empty,
@@ -6810,6 +6834,7 @@ pub fn init(ip: *InternPool, gpa: Allocator, available_threads: usize) !void {
68106834
.extra = .empty,
68116835
.limbs = .empty,
68126836
.strings = .empty,
6837+
.string_bytes = .empty,
68136838
.tracked_insts = .empty,
68146839
.files = .empty,
68156840
.maps = .empty,
@@ -6819,6 +6844,7 @@ pub fn init(ip: *InternPool, gpa: Allocator, available_threads: usize) !void {
68196844
.namespaces = .empty,
68206845
},
68216846
});
6847+
for (ip.locals) |*local| try local.getMutableStrings(gpa).append(.{0});
68226848

68236849
ip.tid_width = @intCast(std.math.log2_int_ceil(usize, used_threads));
68246850
ip.tid_shift_30 = if (single_threaded) 0 else 30 - ip.tid_width;
@@ -8523,30 +8549,30 @@ pub fn get(ip: *InternPool, gpa: Allocator, tid: Zcu.PerThread.Id, key: Key) All
85238549
}
85248550

85258551
if (child == .u8_type) bytes: {
8526-
const strings = ip.getLocal(tid).getMutableStrings(gpa);
8527-
const start = strings.mutate.len;
8528-
try strings.ensureUnusedCapacity(@intCast(len_including_sentinel + 1));
8552+
const string_bytes = ip.getLocal(tid).getMutableStringBytes(gpa);
8553+
const start = string_bytes.mutate.len;
8554+
try string_bytes.ensureUnusedCapacity(@intCast(len_including_sentinel + 1));
85298555
try extra.ensureUnusedCapacity(@typeInfo(Bytes).@"struct".fields.len);
85308556
switch (aggregate.storage) {
8531-
.bytes => |bytes| strings.appendSliceAssumeCapacity(.{bytes.toSlice(len, ip)}),
8557+
.bytes => |bytes| string_bytes.appendSliceAssumeCapacity(.{bytes.toSlice(len, ip)}),
85328558
.elems => |elems| for (elems[0..@intCast(len)]) |elem| switch (ip.indexToKey(elem)) {
85338559
.undef => {
8534-
strings.shrinkRetainingCapacity(start);
8560+
string_bytes.shrinkRetainingCapacity(start);
85358561
break :bytes;
85368562
},
8537-
.int => |int| strings.appendAssumeCapacity(.{@intCast(int.storage.u64)}),
8563+
.int => |int| string_bytes.appendAssumeCapacity(.{@intCast(int.storage.u64)}),
85388564
else => unreachable,
85398565
},
85408566
.repeated_elem => |elem| switch (ip.indexToKey(elem)) {
85418567
.undef => break :bytes,
85428568
.int => |int| @memset(
8543-
strings.addManyAsSliceAssumeCapacity(@intCast(len))[0],
8569+
string_bytes.addManyAsSliceAssumeCapacity(@intCast(len))[0],
85448570
@intCast(int.storage.u64),
85458571
),
85468572
else => unreachable,
85478573
},
85488574
}
8549-
if (sentinel != .none) strings.appendAssumeCapacity(.{
8575+
if (sentinel != .none) string_bytes.appendAssumeCapacity(.{
85508576
@intCast(ip.indexToKey(sentinel).int.storage.u64),
85518577
});
85528578
const string = try ip.getOrPutTrailingString(
@@ -11762,10 +11788,10 @@ pub fn getOrPutString(
1176211788
slice: []const u8,
1176311789
comptime embedded_nulls: EmbeddedNulls,
1176411790
) Allocator.Error!embedded_nulls.StringType() {
11765-
const strings = ip.getLocal(tid).getMutableStrings(gpa);
11766-
try strings.ensureUnusedCapacity(slice.len + 1);
11767-
strings.appendSliceAssumeCapacity(.{slice});
11768-
strings.appendAssumeCapacity(.{0});
11791+
const string_bytes = ip.getLocal(tid).getMutableStringBytes(gpa);
11792+
try string_bytes.ensureUnusedCapacity(slice.len + 1);
11793+
string_bytes.appendSliceAssumeCapacity(.{slice});
11794+
string_bytes.appendAssumeCapacity(.{0});
1176911795
return ip.getOrPutTrailingString(gpa, tid, @intCast(slice.len + 1), embedded_nulls);
1177011796
}
1177111797

@@ -11780,8 +11806,8 @@ pub fn getOrPutStringFmt(
1178011806
// ensure that references to strings in args do not get invalidated
1178111807
const format_z = format ++ .{0};
1178211808
const len: u32 = @intCast(std.fmt.count(format_z, args));
11783-
const strings = ip.getLocal(tid).getMutableStrings(gpa);
11784-
const slice = try strings.addManyAsSlice(len);
11809+
const string_bytes = ip.getLocal(tid).getMutableStringBytes(gpa);
11810+
const slice = try string_bytes.addManyAsSlice(len);
1178511811
assert((std.fmt.bufPrint(slice[0], format_z, args) catch unreachable).len == len);
1178611812
return ip.getOrPutTrailingString(gpa, tid, len, embedded_nulls);
1178711813
}
@@ -11805,21 +11831,27 @@ pub fn getOrPutTrailingString(
1180511831
len: u32,
1180611832
comptime embedded_nulls: EmbeddedNulls,
1180711833
) Allocator.Error!embedded_nulls.StringType() {
11808-
const strings = ip.getLocal(tid).getMutableStrings(gpa);
11809-
const start: u32 = @intCast(strings.mutate.len - len);
11810-
if (len > 0 and strings.view().items(.@"0")[strings.mutate.len - 1] == 0) {
11811-
strings.mutate.len -= 1;
11834+
const local = ip.getLocal(tid);
11835+
const strings = local.getMutableStrings(gpa);
11836+
try strings.ensureUnusedCapacity(1);
11837+
const string_bytes = local.getMutableStringBytes(gpa);
11838+
const start: u32 = @intCast(string_bytes.mutate.len - len);
11839+
if (len > 0 and string_bytes.view().items(.@"0")[string_bytes.mutate.len - 1] == 0) {
11840+
string_bytes.mutate.len -= 1;
1181211841
} else {
11813-
try strings.ensureUnusedCapacity(1);
11842+
try string_bytes.ensureUnusedCapacity(1);
1181411843
}
11815-
const key: []const u8 = strings.view().items(.@"0")[start..];
11816-
const value: embedded_nulls.StringType() =
11817-
@enumFromInt(@intFromEnum((String.Unwrapped{ .tid = tid, .index = start }).wrap(ip)));
11844+
const key: []const u8 = string_bytes.view().items(.@"0")[start..];
11845+
const value: embedded_nulls.StringType() = @enumFromInt(@intFromEnum((String.Unwrapped{
11846+
.tid = tid,
11847+
.index = strings.mutate.len - 1,
11848+
}).wrap(ip)));
1181811849
const has_embedded_null = std.mem.indexOfScalar(u8, key, 0) != null;
1181911850
switch (embedded_nulls) {
1182011851
.no_embedded_nulls => assert(!has_embedded_null),
1182111852
.maybe_embedded_nulls => if (has_embedded_null) {
11822-
strings.appendAssumeCapacity(.{0});
11853+
string_bytes.appendAssumeCapacity(.{0});
11854+
strings.appendAssumeCapacity(.{string_bytes.mutate.len});
1182311855
return value;
1182411856
},
1182511857
}
@@ -11837,7 +11869,7 @@ pub fn getOrPutTrailingString(
1183711869
const index = entry.acquire().unwrap() orelse break;
1183811870
if (entry.hash != hash) continue;
1183911871
if (!index.eqlSlice(key, ip)) continue;
11840-
strings.shrinkRetainingCapacity(start);
11872+
string_bytes.shrinkRetainingCapacity(start);
1184111873
return @enumFromInt(@intFromEnum(index));
1184211874
}
1184311875
shard.mutate.string_map.mutex.lock();
@@ -11853,19 +11885,20 @@ pub fn getOrPutTrailingString(
1185311885
const index = entry.acquire().unwrap() orelse break;
1185411886
if (entry.hash != hash) continue;
1185511887
if (!index.eqlSlice(key, ip)) continue;
11856-
strings.shrinkRetainingCapacity(start);
11888+
string_bytes.shrinkRetainingCapacity(start);
1185711889
return @enumFromInt(@intFromEnum(index));
1185811890
}
1185911891
defer shard.mutate.string_map.len += 1;
1186011892
const map_header = map.header().*;
1186111893
if (shard.mutate.string_map.len < map_header.capacity * 3 / 5) {
11862-
strings.appendAssumeCapacity(.{0});
11894+
string_bytes.appendAssumeCapacity(.{0});
11895+
strings.appendAssumeCapacity(.{string_bytes.mutate.len});
1186311896
const entry = &map.entries[map_index];
1186411897
entry.hash = hash;
1186511898
entry.release(@enumFromInt(@intFromEnum(value)));
1186611899
return value;
1186711900
}
11868-
const arena_state = &ip.getLocal(tid).mutate.arena;
11901+
const arena_state = &local.mutate.arena;
1186911902
var arena = arena_state.promote(gpa);
1187011903
defer arena_state.* = arena.state;
1187111904
const new_map_capacity = map_header.capacity * 2;
@@ -11901,7 +11934,8 @@ pub fn getOrPutTrailingString(
1190111934
map_index &= new_map_mask;
1190211935
if (map.entries[map_index].value == .none) break;
1190311936
}
11904-
strings.appendAssumeCapacity(.{0});
11937+
string_bytes.appendAssumeCapacity(.{0});
11938+
strings.appendAssumeCapacity(.{string_bytes.mutate.len});
1190511939
map.entries[map_index] = .{
1190611940
.value = @enumFromInt(@intFromEnum(value)),
1190711941
.hash = hash,

0 commit comments

Comments
 (0)