Skip to content

Commit 50f3631

Browse files
committed
[clangd] Define a compact binary serialization format for symbol slab/index.
Summary: This is intended to replace the current YAML format for general use. It's ~10x more compact than YAML, and ~40% more compact than gzipped YAML: llvmidx.riff = 20M, llvmidx.yaml = 272M, llvmidx.yaml.gz = 32M. It's also simpler/faster to read and write. The format is a RIFF container (chunks of (type, size, data)) with: - a compressed string table - simple binary encoding of symbols (with varints for compactness). It can be extended to include occurrences, Dex posting lists, etc. There's no rich backwards-compatibility scheme, but a version number is included so we can detect incompatible files and do ad-hoc back-compat. Alternatives considered: - compressed YAML or JSON: bulky and slow to load - llvm bitstream: confusing model and libraries are hard to use. My attempt produced slightly larger files, and the code was longer and slower. - protobuf or similar: would be really nice (esp for back-compat) but the dependency is a big hassle - ad-hoc binary format without a container: it seems clear we're going to add posting lists and occurrences here, and that they will benefit from sharing a string table. The container makes it easy to debug these pieces in isolation, and make them optional. Reviewers: ioeric Subscribers: mgorny, ilya-biryukov, MaskRay, jkorous, mgrang, arphaman, kadircet, cfe-commits Differential Revision: https://reviews.llvm.org/D51585 llvm-svn: 341375
1 parent cc8b507 commit 50f3631

14 files changed

+848
-132
lines changed

clang-tools-extra/clangd/CMakeLists.txt

+2
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@ add_clang_library(clangDaemon
2929
Protocol.cpp
3030
ProtocolHandlers.cpp
3131
Quality.cpp
32+
RIFF.cpp
3233
SourceCode.cpp
3334
Threading.cpp
3435
Trace.cpp
@@ -41,6 +42,7 @@ add_clang_library(clangDaemon
4142
index/Index.cpp
4243
index/MemIndex.cpp
4344
index/Merge.cpp
45+
index/Serialization.cpp
4446
index/SymbolCollector.cpp
4547
index/SymbolYAML.cpp
4648

clang-tools-extra/clangd/RIFF.cpp

+88
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,88 @@
1+
//===--- RIFF.cpp - Binary container file format --------------------------===//
2+
//
3+
// The LLVM Compiler Infrastructure
4+
//
5+
// This file is distributed under the University of Illinois Open Source
6+
// License. See LICENSE.TXT for details.
7+
//
8+
//===----------------------------------------------------------------------===//
9+
10+
#include "RIFF.h"
11+
#include "llvm/Support/Endian.h"
12+
13+
using namespace llvm;
14+
namespace clang {
15+
namespace clangd {
16+
namespace riff {
17+
18+
// Wraps a fixed message in an llvm::Error carrying the "inconvertible" error
// code; used for every parse failure reported by this file.
static Error makeError(const char *Msg) {
  const auto Code = inconvertibleErrorCode();
  return createStringError(Code, Msg);
}
21+
22+
// Reads one chunk (4-byte ID, little-endian 32-bit length, data, optional
// padding byte) from the front of Stream, and advances Stream past everything
// that was consumed.
//
// Returns an error if fewer than 8 header bytes remain, if the declared length
// exceeds the remaining data, or if a padding byte is present but nonzero.
// The returned Chunk's Data is a slice of the buffer Stream refers to.
Expected<Chunk> readChunk(StringRef &Stream) {
  if (Stream.size() < 8)
    return makeError("incomplete chunk header");
  Chunk C;
  std::copy(Stream.begin(), Stream.begin() + 4, C.ID.begin());
  Stream = Stream.drop_front(4);
  uint32_t Len = support::endian::read32le(Stream.take_front(4).begin());
  Stream = Stream.drop_front(4);
  if (Stream.size() < Len)
    return makeError("truncated chunk");
  C.Data = Stream.take_front(Len);
  Stream = Stream.drop_front(Len);
  // Odd-length data is followed by one padding byte. If the stream ends right
  // after the data, the missing padding byte is tolerated.
  if (Len % 2 != 0 && !Stream.empty()) {
    if (Stream.front())
      return makeError("nonzero padding byte");
    Stream = Stream.drop_front();
  }
  return C;
}
41+
42+
// Serializes a chunk: ID, 4-byte little-endian length, data, and a single zero
// padding byte when the data length is odd.
// NOTE(review): the length field is only 4 bytes wide; presumably data larger
// than 4GiB is not expected here -- confirm with callers.
raw_ostream &operator<<(raw_ostream &OS, const Chunk &C) {
  const size_t PayloadSize = C.Data.size();
  char LengthField[4];
  llvm::support::endian::write32le(LengthField, PayloadSize);

  OS.write(C.ID.begin(), C.ID.size());
  OS.write(LengthField, sizeof(LengthField));
  OS << C.Data;
  if (PayloadSize % 2 != 0)
    OS.write(0);
  return OS;
}
52+
53+
// Parses Stream as a RIFF file: one outer chunk with ID "RIFF" whose data is a
// 4-byte file type followed by a sequence of nested chunks.
// Fails if the outer chunk cannot be read, is not a "RIFF" chunk, is too short
// to hold the type, or if any nested chunk fails to parse.
llvm::Expected<File> readFile(llvm::StringRef Stream) {
  auto Outer = readChunk(Stream);
  if (!Outer)
    return Outer.takeError();
  if (Outer->ID != fourCC("RIFF"))
    return makeError("not a RIFF container");

  llvm::StringRef Rest = Outer->Data;
  if (Rest.size() < 4)
    return makeError("RIFF chunk too short");

  File Result;
  std::copy(Rest.begin(), Rest.begin() + 4, Result.Type.begin());
  Rest = Rest.drop_front(4);

  // readChunk() advances Rest past each nested chunk it consumes.
  while (!Rest.empty()) {
    auto Nested = readChunk(Rest);
    if (!Nested)
      return Nested.takeError();
    Result.Chunks.push_back(*Nested);
  }
  return Result;
}
70+
71+
// Serializes a RIFF file as a single outer "RIFF" chunk.
raw_ostream &operator<<(raw_ostream &OS, const File &F) {
  // The outer chunk is written field by field instead of going through a
  // Chunk, so the nested chunks never have to be copied into one buffer.
  // Its data is the 4-byte file type followed by each nested chunk, which
  // occupies an 8-byte header, its data, and a padding byte if the data length
  // is odd.
  size_t PayloadLen = 4;
  for (const Chunk &Nested : F.Chunks) {
    const size_t Len = Nested.Data.size();
    PayloadLen += 4 + 4 + Len + (Len % 2);
  }

  char LengthField[4];
  llvm::support::endian::write32le(LengthField, PayloadLen);

  OS << "RIFF";
  OS.write(LengthField, sizeof(LengthField));
  OS.write(F.Type.begin(), F.Type.size());
  for (const Chunk &Nested : F.Chunks)
    OS << Nested;
  return OS;
}
85+
86+
} // namespace riff
87+
} // namespace clangd
88+
} // namespace clang

clang-tools-extra/clangd/RIFF.h

+81
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,81 @@
1+
//===--- RIFF.h - Binary container file format -------------------*- C++-*-===//
2+
//
3+
// The LLVM Compiler Infrastructure
4+
//
5+
// This file is distributed under the University of Illinois Open Source
6+
// License. See LICENSE.TXT for details.
7+
//
8+
//===----------------------------------------------------------------------===//
9+
//
10+
// Tools for reading and writing data in RIFF containers.
11+
//
12+
// A chunk consists of:
13+
// - ID : char[4]
14+
// - Length : uint32
15+
// - Data : byte[Length]
16+
// - Padding : byte[Length % 2]
17+
// The semantics of a chunk's Data are determined by its ID.
18+
// The format makes it easy to skip over uninteresting or unknown chunks.
19+
//
20+
// A RIFF file is a single chunk with ID "RIFF". Its Data is:
21+
// - Type : char[4]
22+
// - Chunks : chunk[]
23+
//
24+
// This means that a RIFF file consists of:
25+
// - "RIFF" : char[4]
26+
// - File length - 8 : uint32
27+
// - File type : char[4]
28+
// - Chunks : chunk[]
29+
//
30+
//===----------------------------------------------------------------------===//
31+
#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANGD_RIFF_H
32+
#define LLVM_CLANG_TOOLS_EXTRA_CLANGD_RIFF_H
33+
#include "llvm/ADT/StringRef.h"
34+
#include "llvm/Support/Error.h"
35+
#include "llvm/Support/ScopedPrinter.h"
36+
#include <array>
37+
38+
namespace clang {
39+
namespace clangd {
40+
namespace riff {
41+
42+
// A FourCC is a four-character code. It names a chunk within a file, or the
// type of the file itself.
using FourCC = std::array<char, 4>;
// Builds a FourCC from a four-character string literal, e.g. fourCC("RIFF").
// The literal's terminating NUL is not part of the result.
inline constexpr FourCC fourCC(const char (&Literal)[5]) {
  return {{Literal[0], Literal[1], Literal[2], Literal[3]}};
}
48+
// A chunk is a section in a RIFF container.
49+
struct Chunk {
50+
FourCC ID;
51+
llvm::StringRef Data;
52+
};
53+
inline bool operator==(const Chunk &L, const Chunk &R) {
54+
return std::tie(L.ID, L.Data) == std::tie(R.ID, R.Data);
55+
}
56+
// A File is a RIFF container, which is a typed chunk sequence.
57+
struct File {
58+
FourCC Type;
59+
std::vector<Chunk> Chunks;
60+
};
61+
inline bool operator==(const File &L, const File &R) {
62+
return std::tie(L.Type, L.Chunks) == std::tie(R.Type, R.Chunks);
63+
}
64+
65+
// Reads a single chunk from the start of Stream.
66+
// Stream is updated to exclude the consumed chunk.
67+
// Returns an error if the 8-byte header is incomplete, the data is truncated,
// or the padding byte following odd-length data is nonzero.
llvm::Expected<Chunk> readChunk(llvm::StringRef &Stream);
68+
69+
// Serialize a single chunk to OS.
70+
// Writes the ID, the length as 4 little-endian bytes, the data, and a zero
// padding byte when the data length is odd.
llvm::raw_ostream &operator<<(llvm::raw_ostream &OS, const Chunk &);
71+
72+
// Parses a RIFF file consisting of a single RIFF chunk.
73+
// Returns an error if the outer chunk is not a "RIFF" chunk, is too short to
// hold the 4-byte file type, or if any nested chunk fails to parse.
llvm::Expected<File> readFile(llvm::StringRef Stream);
74+
75+
// Serialize a RIFF file (i.e. a single RIFF chunk) to OS.
76+
// The outer chunk's length is computed up front from the nested chunk sizes,
// so the nested chunks are streamed directly rather than copied.
llvm::raw_ostream &operator<<(llvm::raw_ostream &OS, const File &);
77+
78+
} // namespace riff
79+
} // namespace clangd
80+
} // namespace clang
81+
#endif

clang-tools-extra/clangd/global-symbol-builder/GlobalSymbolBuilderMain.cpp

+25-7
Original file line numberDiff line numberDiff line change
@@ -7,15 +7,16 @@
77
//
88
//===----------------------------------------------------------------------===//
99
//
10-
// GlobalSymbolBuilder is a tool to generate YAML-format symbols across the
11-
// whole project. This tools is for **experimental** only. Don't use it in
12-
// production code.
10+
// GlobalSymbolBuilder is a tool to extract symbols from a whole project.
11+
// This tool is **experimental** only. Don't use it in production code.
1312
//
1413
//===----------------------------------------------------------------------===//
1514

15+
#include "RIFF.h"
1616
#include "index/CanonicalIncludes.h"
1717
#include "index/Index.h"
1818
#include "index/Merge.h"
19+
#include "index/Serialization.h"
1920
#include "index/SymbolCollector.h"
2021
#include "index/SymbolYAML.h"
2122
#include "clang/Frontend/CompilerInstance.h"
@@ -59,6 +60,14 @@ static llvm::cl::opt<bool> MergeOnTheFly(
5960
"MapReduce."),
6061
llvm::cl::init(true), llvm::cl::Hidden);
6162

63+
enum class Format { YAML, Binary };
64+
static llvm::cl::opt<Format>
65+
Format("format", llvm::cl::desc("Format of the index to be written"),
66+
llvm::cl::values(
67+
clEnumValN(Format::YAML, "yaml", "human-readable YAML format"),
68+
clEnumValN(Format::Binary, "binary", "binary RIFF format")),
69+
llvm::cl::init(Format::YAML));
70+
6271
/// Responsible for aggregating symbols from each processed file and producing
6372
/// the final results. All methods in this class must be thread-safe,
6473
/// 'consumeSymbols' may be called from multiple threads.
@@ -210,8 +219,8 @@ int main(int argc, const char **argv) {
210219
llvm::sys::PrintStackTraceOnErrorSignal(argv[0]);
211220

212221
const char *Overview = R"(
213-
This is an **experimental** tool to generate YAML-format project-wide symbols
214-
for clangd (global code completion). It would be changed and deprecated
222+
This is an **experimental** tool to extract symbols from a whole project
223+
for clangd (global code completion). It will be changed and deprecated
215224
eventually. Don't use it in production code!
216225
217226
Example usage for building index for the whole project using CMake compile
@@ -262,7 +271,16 @@ int main(int argc, const char **argv) {
262271
}
263272
// Reduce phase: combine symbols with the same IDs.
264273
auto UniqueSymbols = Consumer->mergeResults();
265-
// Output phase: emit YAML for result symbols.
266-
SymbolsToYAML(UniqueSymbols, llvm::outs());
274+
// Output phase: emit result symbols.
275+
switch (clang::clangd::Format) {
276+
case clang::clangd::Format::YAML:
277+
SymbolsToYAML(UniqueSymbols, llvm::outs());
278+
break;
279+
case clang::clangd::Format::Binary: {
280+
clang::clangd::IndexFileOut Out;
281+
Out.Symbols = &UniqueSymbols;
282+
llvm::outs() << Out;
283+
}
284+
}
267285
return 0;
268286
}

clang-tools-extra/clangd/index/Index.cpp

+15-31
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
#include "Index.h"
1111
#include "llvm/ADT/StringExtras.h"
1212
#include "llvm/ADT/StringRef.h"
13+
#include "llvm/Support/Error.h"
1314
#include "llvm/Support/SHA1.h"
1415
#include "llvm/Support/raw_ostream.h"
1516

@@ -28,21 +29,20 @@ SymbolID::SymbolID(StringRef USR)
2829
: HashValue(SHA1::hash(arrayRefFromStringRef(USR))) {}
2930

3031
raw_ostream &operator<<(raw_ostream &OS, const SymbolID &ID) {
31-
OS << toHex(toStringRef(ID.HashValue));
32-
return OS;
32+
return OS << toHex(ID.raw());
3333
}
3434

35-
std::string SymbolID::str() const {
36-
std::string ID;
37-
llvm::raw_string_ostream OS(ID);
38-
OS << *this;
39-
return OS.str();
35+
SymbolID SymbolID::fromRaw(llvm::StringRef Raw) {
36+
SymbolID ID;
37+
assert(Raw.size() == RawSize);
38+
memcpy(ID.HashValue.data(), Raw.data(), RawSize);
39+
return ID;
4040
}
4141

42+
std::string SymbolID::str() const { return toHex(raw()); }
43+
4244
void operator>>(StringRef Str, SymbolID &ID) {
43-
std::string HexString = fromHex(Str);
44-
assert(HexString.size() == ID.HashValue.size());
45-
std::copy(HexString.begin(), HexString.end(), ID.HashValue.begin());
45+
ID = SymbolID::fromRaw(fromHex(Str));
4646
}
4747

4848
raw_ostream &operator<<(raw_ostream &OS, SymbolOrigin O) {
@@ -78,34 +78,18 @@ SymbolSlab::const_iterator SymbolSlab::find(const SymbolID &ID) const {
7878
}
7979

8080
// Copy the underlying data of the symbol into the owned arena.
81-
static void own(Symbol &S, llvm::UniqueStringSaver &Strings,
82-
BumpPtrAllocator &Arena) {
83-
// Intern replaces V with a reference to the same string owned by the arena.
84-
auto Intern = [&](StringRef &V) { V = Strings.save(V); };
85-
86-
// We need to copy every StringRef field onto the arena.
87-
Intern(S.Name);
88-
Intern(S.Scope);
89-
Intern(S.CanonicalDeclaration.FileURI);
90-
Intern(S.Definition.FileURI);
91-
92-
Intern(S.Signature);
93-
Intern(S.CompletionSnippetSuffix);
94-
95-
Intern(S.Documentation);
96-
Intern(S.ReturnType);
97-
for (auto &I : S.IncludeHeaders)
98-
Intern(I.IncludeHeader);
81+
static void own(Symbol &S, llvm::UniqueStringSaver &Strings) {
82+
visitStrings(S, [&](StringRef &V) { V = Strings.save(V); });
9983
}
10084

10185
void SymbolSlab::Builder::insert(const Symbol &S) {
10286
auto R = SymbolIndex.try_emplace(S.ID, Symbols.size());
10387
if (R.second) {
10488
Symbols.push_back(S);
105-
own(Symbols.back(), UniqueStrings, Arena);
89+
own(Symbols.back(), UniqueStrings);
10690
} else {
10791
auto &Copy = Symbols[R.first->second] = S;
108-
own(Copy, UniqueStrings, Arena);
92+
own(Copy, UniqueStrings);
10993
}
11094
}
11195

@@ -118,7 +102,7 @@ SymbolSlab SymbolSlab::Builder::build() && {
118102
BumpPtrAllocator NewArena;
119103
llvm::UniqueStringSaver Strings(NewArena);
120104
for (auto &S : Symbols)
121-
own(S, Strings, NewArena);
105+
own(S, Strings);
122106
return SymbolSlab(std::move(NewArena), std::move(Symbols));
123107
}
124108

0 commit comments

Comments
 (0)