Skip to content

Commit 8ef460f

Browse files
committed
[llvm-reduce] Add parallel chunk processing.
This patch adds parallel processing of chunks. When reducing very large inputs, e.g. functions with 500k basic blocks, processing chunks in parallel can significantly speed up the reduction. To allow modifying clones of the original module in parallel, each clone needs its own LLVMContext object. To achieve this, each job parses the input module with its own LLVMContext. In case a job successfully reduced the input, it serializes the result module as bitcode into a result array. To ensure parallel reduction produces the same results as serial reduction, only the first successfully reduced result is used, and results of other successful jobs are dropped. Processing resumes after the chunk that was successfully reduced. The number of threads to use can be configured using the -j option. It defaults to 1, which means serial processing. Reviewed By: Meinersbur Differential Revision: https://reviews.llvm.org/D113857
1 parent 6f82264 commit 8ef460f

File tree

3 files changed

+150
-9
lines changed

3 files changed

+150
-9
lines changed

llvm/test/tools/llvm-reduce/operands-skip.ll

+7
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,13 @@
11
; RUN: llvm-reduce %s -o %t --delta-passes=operands-skip --test FileCheck --test-arg %s --test-arg --match-full-lines --test-arg --check-prefix=INTERESTING --test-arg --input-file
22
; RUN: FileCheck %s --input-file %t --check-prefixes=REDUCED
33

4+
; RUN: llvm-reduce -j 2 %s -o %t.1 --delta-passes=operands-skip --test FileCheck --test-arg %s --test-arg --match-full-lines --test-arg --check-prefix=INTERESTING --test-arg --input-file
5+
; RUN: FileCheck %s --input-file %t.1 --check-prefixes=REDUCED
6+
7+
; RUN: llvm-reduce -j 4 %s -o %t.2 --delta-passes=operands-skip --test FileCheck --test-arg %s --test-arg --match-full-lines --test-arg --check-prefix=INTERESTING --test-arg --input-file
8+
; RUN: FileCheck %s --input-file %t.2 --check-prefixes=REDUCED
9+
10+
411
; INTERESTING: store i32 43, i32* {{(%imm|%indirect)}}, align 4
512
; REDUCED: store i32 43, i32* %imm, align 4
613

llvm/tools/llvm-reduce/CMakeLists.txt

+1
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@ set(LLVM_LINK_COMPONENTS
33
AllTargetsCodeGens
44
AllTargetsDescs
55
AllTargetsInfos
6+
BitReader
67
BitWriter
78
CodeGen
89
Core

llvm/tools/llvm-reduce/deltas/Delta.cpp

+142-9
Original file line numberDiff line numberDiff line change
@@ -15,9 +15,11 @@
1515
#include "Delta.h"
1616
#include "ReducerWorkItem.h"
1717
#include "llvm/ADT/STLExtras.h"
18+
#include "llvm/Bitcode/BitcodeReader.h"
1819
#include "llvm/Bitcode/BitcodeWriter.h"
1920
#include "llvm/IR/Verifier.h"
2021
#include "llvm/Support/CommandLine.h"
22+
#include "llvm/Support/ThreadPool.h"
2123
#include "llvm/Support/ToolOutputFile.h"
2224
#include <fstream>
2325
#include <set>
@@ -37,6 +39,16 @@ static cl::opt<bool> TmpFilesAsBitcode(
3739
cl::desc("Write temporary files as bitcode, instead of textual IR"),
3840
cl::init(false));
3941

42+
// Number of threads used to process chunks (the `-j` option). The default of
// 1 selects the serial code path; without the option, NumJobs is a plain
// variable fixed at 1.
// NOTE(review): LLVM_ENABLE_THREADS is presumably always defined (to 0 or 1)
// by the LLVM config header, in which case `#ifdef` never selects the #else
// branch -- confirm whether `#if LLVM_ENABLE_THREADS` was intended.
// NOTE(review): the help text below reads "to disables"; likely meant
// "to disable".
#ifdef LLVM_ENABLE_THREADS
43+
static cl::opt<unsigned> NumJobs(
44+
"j",
45+
cl::desc("Maximum number of threads to use to process chunks. Set to 1 to "
46+
"disables parallelism."),
47+
cl::init(1));
48+
#else
49+
unsigned NumJobs = 1;
50+
#endif
51+
4052
void writeOutput(ReducerWorkItem &M, llvm::StringRef Message);
4153

4254
bool isReduced(ReducerWorkItem &M, TestRunner &Test,
@@ -120,7 +132,8 @@ static bool increaseGranularity(std::vector<Chunk> &Chunks) {
120132
// modified module if the chunk resulted in a reduction.
121133
template <typename T>
122134
static std::unique_ptr<ReducerWorkItem>
123-
CheckChunk(Chunk &ChunkToCheckForUninterestingness, TestRunner &Test,
135+
CheckChunk(Chunk &ChunkToCheckForUninterestingness,
136+
std::unique_ptr<ReducerWorkItem> Clone, TestRunner &Test,
124137
function_ref<void(Oracle &, T &)> ExtractChunksFromModule,
125138
std::set<Chunk> &UninterestingChunks,
126139
std::vector<Chunk> &ChunksStillConsideredInteresting) {
@@ -137,9 +150,6 @@ CheckChunk(Chunk &ChunkToCheckForUninterestingness, TestRunner &Test,
137150
C != ChunkToCheckForUninterestingness;
138151
});
139152

140-
// Clone module before hacking it up..
141-
std::unique_ptr<ReducerWorkItem> Clone =
142-
cloneReducerWorkItem(Test.getProgram());
143153
// Generate Module with only Targets inside Current Chunks
144154
Oracle O(CurrentChunks);
145155
ExtractChunksFromModule(O, *Clone);
@@ -169,6 +179,36 @@ CheckChunk(Chunk &ChunkToCheckForUninterestingness, TestRunner &Test,
169179
return Clone;
170180
}
171181

182+
/// Checks a single chunk against a private copy of the program that is
/// deserialized from \p OriginalBC into a function-local LLVMContext.
///
/// The freshly parsed module is wrapped in a ReducerWorkItem and handed to
/// CheckChunk together with the remaining arguments, which are forwarded
/// unchanged.
///
/// \param OriginalBC Bitcode of the program to start from.
/// \param AnyReduced Set to true when CheckChunk returned a result for this
///        chunk; never reset here.
/// \returns The bitcode of the module returned by CheckChunk, or an empty
///          string when CheckChunk returned null.
///
/// Aborts via report_fatal_error if \p OriginalBC cannot be parsed.
template <typename T>
183+
SmallString<0> ProcessChunkFromSerializedBitcode(
184+
Chunk &ChunkToCheckForUninterestingness, TestRunner &Test,
185+
function_ref<void(Oracle &, T &)> ExtractChunksFromModule,
186+
std::set<Chunk> &UninterestingChunks,
187+
std::vector<Chunk> &ChunksStillConsideredInteresting,
188+
SmallString<0> &OriginalBC, std::atomic<bool> &AnyReduced) {
189+
// Declared before every object that holds the parsed module, so the context
// is destroyed last.
LLVMContext Ctx;
190+
// Parse directly from OriginalBC's storage; the buffer is only referenced,
// not copied.
Expected<std::unique_ptr<Module>> MOrErr = parseBitcodeFile(
191+
MemoryBufferRef(StringRef(OriginalBC.data(), OriginalBC.size()),
192+
"<llvm-reduce tmp module>"),
193+
Ctx);
194+
if (!MOrErr)
195+
report_fatal_error("Failed to read bitcode");
196+
auto CloneMMM = std::make_unique<ReducerWorkItem>();
197+
CloneMMM->M = std::move(MOrErr.get());
198+
199+
// Stays empty unless CheckChunk hands back a work item below.
SmallString<0> Result;
200+
if (std::unique_ptr<ReducerWorkItem> ChunkResult =
201+
CheckChunk(ChunkToCheckForUninterestingness, std::move(CloneMMM),
202+
Test, ExtractChunksFromModule, UninterestingChunks,
203+
ChunksStillConsideredInteresting)) {
204+
// Serialize the returned module so the result does not depend on Ctx,
// which goes out of scope when this function returns.
raw_svector_ostream BCOS(Result);
205+
WriteBitcodeToFile(*ChunkResult->M, BCOS);
206+
// Communicate that the task reduced a chunk.
207+
AnyReduced = true;
208+
}
209+
return Result;
210+
}
211+
172212
/// Runs the Delta Debugging algorithm, splits the code into chunks and
173213
/// reduces the amount of chunks that are considered interesting by the
174214
/// given test.
@@ -207,19 +247,112 @@ void runDeltaPassInt(
207247
increaseGranularity(ChunksStillConsideredInteresting);
208248
}
209249

250+
std::atomic<bool> AnyReduced;
251+
std::unique_ptr<ThreadPool> ChunkThreadPoolPtr;
252+
if (NumJobs > 1)
253+
ChunkThreadPoolPtr =
254+
std::make_unique<ThreadPool>(hardware_concurrency(NumJobs));
255+
210256
bool FoundAtLeastOneNewUninterestingChunkWithCurrentGranularity;
211257
do {
212258
FoundAtLeastOneNewUninterestingChunkWithCurrentGranularity = false;
213259

214260
std::set<Chunk> UninterestingChunks;
215-
for (Chunk &ChunkToCheckForUninterestingness :
216-
reverse(ChunksStillConsideredInteresting)) {
217-
std::unique_ptr<ReducerWorkItem> Result = CheckChunk(
218-
ChunkToCheckForUninterestingness, Test, ExtractChunksFromModule,
219-
UninterestingChunks, ChunksStillConsideredInteresting);
261+
262+
// When running with more than one thread, serialize the original bitcode
263+
// to OriginalBC.
264+
SmallString<0> OriginalBC;
265+
if (NumJobs > 1) {
266+
raw_svector_ostream BCOS(OriginalBC);
267+
WriteBitcodeToFile(*Test.getProgram().M, BCOS);
268+
}
269+
270+
std::deque<std::future<SmallString<0>>> TaskQueue;
271+
for (auto I = ChunksStillConsideredInteresting.rbegin(),
272+
E = ChunksStillConsideredInteresting.rend();
273+
I != E; ++I) {
274+
std::unique_ptr<ReducerWorkItem> Result = nullptr;
275+
unsigned WorkLeft = std::distance(I, E);
276+
277+
// Run in parallel mode, if the user requested more than one thread and
278+
// there are at least a few chunks to process.
279+
if (NumJobs > 1 && WorkLeft > 1) {
280+
unsigned NumInitialTasks = std::min(WorkLeft, unsigned(NumJobs));
281+
unsigned NumChunksProcessed = 0;
282+
283+
ThreadPool &ChunkThreadPool = *ChunkThreadPoolPtr;
284+
TaskQueue.clear();
285+
286+
AnyReduced = false;
287+
// Queue jobs to process NumInitialTasks chunks in parallel using
288+
// ChunkThreadPool. When the tasks are added to the pool, parse the
289+
// original module from OriginalBC with a fresh LLVMContext object. This
290+
// ensures that the cloned module of each task uses an independent
291+
// LLVMContext object. If a task reduces the input, serialize the result
292+
// back in the corresponding Result element.
293+
for (unsigned J = 0; J < NumInitialTasks; ++J) {
294+
TaskQueue.emplace_back(ChunkThreadPool.async(
295+
[J, I, &Test, &ExtractChunksFromModule, &UninterestingChunks,
296+
&ChunksStillConsideredInteresting, &OriginalBC, &AnyReduced]() {
297+
return ProcessChunkFromSerializedBitcode(
298+
*(I + J), Test, ExtractChunksFromModule,
299+
UninterestingChunks, ChunksStillConsideredInteresting,
300+
OriginalBC, AnyReduced);
301+
}));
302+
}
303+
304+
// Start processing results of the queued tasks. We wait for the first
305+
// task in the queue to finish. If it reduced a chunk, we parse the
306+
// result and exit the loop.
307+
// Otherwise we will try to schedule a new task, if
308+
// * no other pending job reduced a chunk and
309+
// * we have not reached the end of the chunk list.
310+
while (!TaskQueue.empty()) {
311+
auto &Future = TaskQueue.front();
312+
Future.wait();
313+
314+
NumChunksProcessed++;
315+
SmallString<0> Res = Future.get();
316+
TaskQueue.pop_front();
317+
if (Res.empty()) {
318+
unsigned NumScheduledTasks = NumChunksProcessed + TaskQueue.size();
319+
if (!AnyReduced && I + NumScheduledTasks != E) {
320+
Chunk &ChunkToCheck = *(I + NumScheduledTasks);
321+
TaskQueue.emplace_back(ChunkThreadPool.async(
322+
[&Test, &ExtractChunksFromModule, &UninterestingChunks,
323+
&ChunksStillConsideredInteresting, &OriginalBC,
324+
&ChunkToCheck, &AnyReduced]() {
325+
return ProcessChunkFromSerializedBitcode(
326+
ChunkToCheck, Test, ExtractChunksFromModule,
327+
UninterestingChunks, ChunksStillConsideredInteresting,
328+
OriginalBC, AnyReduced);
329+
}));
330+
}
331+
continue;
332+
}
333+
334+
Expected<std::unique_ptr<Module>> MOrErr = parseBitcodeFile(
335+
MemoryBufferRef(StringRef(Res.data(), Res.size()),
336+
"<llvm-reduce tmp module>"),
337+
Test.getProgram().M->getContext());
338+
if (!MOrErr)
339+
report_fatal_error("Failed to read bitcode");
340+
Result = std::make_unique<ReducerWorkItem>();
341+
Result->M = std::move(MOrErr.get());
342+
break;
343+
}
344+
// Forward I to the last chunk processed in parallel.
345+
I += NumChunksProcessed - 1;
346+
} else {
347+
Result = CheckChunk(*I, cloneReducerWorkItem(Test.getProgram()), Test,
348+
ExtractChunksFromModule, UninterestingChunks,
349+
ChunksStillConsideredInteresting);
350+
}
351+
220352
if (!Result)
221353
continue;
222354

355+
Chunk &ChunkToCheckForUninterestingness = *I;
223356
FoundAtLeastOneNewUninterestingChunkWithCurrentGranularity = true;
224357
UninterestingChunks.insert(ChunkToCheckForUninterestingness);
225358
ReducedProgram = std::move(Result);

0 commit comments

Comments
 (0)