Skip to content

Commit 83f9b13

Browse files
committed
[libc] Optimized version of memmove
This implementation relies on storing data in registers for sizes up to 128B. Then depending on whether `dst` is less (resp. greater) than `src` we move data forward (resp. backward) by chunks of 32B. We first make sure one of the pointers is aligned to increase performance on large move sizes. Differential Revision: https://reviews.llvm.org/D114637
1 parent 508b3f4 commit 83f9b13

File tree

9 files changed

+357
-112
lines changed

9 files changed

+357
-112
lines changed

libc/src/__support/CPP/ArrayRef.h

+4
Original file line numberDiff line numberDiff line change
@@ -131,6 +131,10 @@ struct MutableArrayRef : public internal::ArrayRefBase<T> {
131131
public:
132132
// From Array.
133133
template <size_t N> MutableArrayRef(Array<T, N> &Arr) : Impl(Arr.Data, N) {}
134+
135+
operator ArrayRef<T>() const {
136+
return ArrayRef<T>(this->data(), this->size());
137+
}
134138
};
135139

136140
} // namespace cpp

libc/src/string/CMakeLists.txt

-1
Original file line numberDiff line numberDiff line change
@@ -409,7 +409,6 @@ function(add_memmove memmove_name)
409409
HDRS ${LIBC_SOURCE_DIR}/src/string/memmove.h
410410
DEPENDS
411411
.memory_utils.memory_utils
412-
.memory_utils.memcpy_implementation
413412
libc.include.string
414413
COMPILE_OPTIONS
415414
-fno-builtin

libc/src/string/memmove.cpp

+27-45
Original file line numberDiff line numberDiff line change
@@ -10,59 +10,41 @@
1010

1111
#include "src/__support/common.h"
1212
#include "src/__support/integer_operations.h"
13-
#include "src/string/memory_utils/memcpy_implementations.h"
13+
#include "src/string/memory_utils/elements.h"
1414
#include <stddef.h> // size_t, ptrdiff_t
1515

1616
namespace __llvm_libc {
1717

18-
static inline void move_byte_forward(char *dest_m, const char *src_m,
19-
size_t count) {
20-
for (size_t offset = 0; count; --count, ++offset)
21-
dest_m[offset] = src_m[offset];
22-
}
23-
24-
static inline void move_byte_backward(char *dest_m, const char *src_m,
25-
size_t count) {
26-
for (size_t offset = count - 1; count; --count, --offset)
27-
dest_m[offset] = src_m[offset];
18+
static inline void inline_memmove(char *dst, const char *src, size_t count) {
19+
using namespace __llvm_libc::scalar;
20+
if (count == 0)
21+
return;
22+
if (count == 1)
23+
return move<_1>(dst, src);
24+
if (count <= 4)
25+
return move<HeadTail<_2>>(dst, src, count);
26+
if (count <= 8)
27+
return move<HeadTail<_4>>(dst, src, count);
28+
if (count <= 16)
29+
return move<HeadTail<_8>>(dst, src, count);
30+
if (count <= 32)
31+
return move<HeadTail<_16>>(dst, src, count);
32+
if (count <= 64)
33+
return move<HeadTail<_32>>(dst, src, count);
34+
if (count <= 128)
35+
return move<HeadTail<_64>>(dst, src, count);
36+
37+
using AlignedMoveLoop = Align<_16, Arg::Src>::Then<Loop<_64>>;
38+
if (dst < src)
39+
return move<AlignedMoveLoop>(dst, src, count);
40+
else if (dst > src)
41+
return move_backward<AlignedMoveLoop>(dst, src, count);
2842
}
2943

3044
LLVM_LIBC_FUNCTION(void *, memmove,
3145
(void *dst, const void *src, size_t count)) {
32-
char *dest_c = reinterpret_cast<char *>(dst);
33-
const char *src_c = reinterpret_cast<const char *>(src);
34-
35-
// If the distance between `src_c` and `dest_c` is equal to or greater
36-
// than `count` (integerAbs(src_c - dest_c) >= count), they would not overlap.
37-
// e.g. greater equal overlapping
38-
// [12345678] [12345678] [12345678]
39-
// src_c: [_ab_____] [_ab_____] [_ab_____]
40-
// dest_c:[_____yz_] [___yz___] [__yz____]
41-
42-
// Call `memcpy` if `src_c` and `dest_c` do not overlap.
43-
if (__llvm_libc::integer_abs(src_c - dest_c) >=
44-
static_cast<ptrdiff_t>(count)) {
45-
inline_memcpy(dest_c, src_c, count);
46-
return dest_c;
47-
}
48-
49-
// Overlapping cases.
50-
// If `dest_c` starts before `src_c` (dest_c < src_c), copy
51-
// forward(pointer add 1) from beginning to end.
52-
// If `dest_c` starts after `src_c` (dest_c > src_c), copy
53-
// backward(pointer add -1) from end to beginning.
54-
// If `dest_c` and `src_c` start at the same address (dest_c == src_c),
55-
// just return dest.
56-
// e.g. forward backward
57-
// *-> <-*
58-
// src_c : [___abcde_] [_abcde___]
59-
// dest_c: [_abc--___] [___--cde_]
60-
61-
// TODO: Optimize `move_byte_xxx(...)` functions.
62-
if (dest_c < src_c)
63-
move_byte_forward(dest_c, src_c, count);
64-
if (dest_c > src_c)
65-
move_byte_backward(dest_c, src_c, count);
46+
inline_memmove(reinterpret_cast<char *>(dst),
47+
reinterpret_cast<const char *>(src), count);
6648
return dst;
6749
}
6850

libc/src/string/memory_utils/elements.h

+139-23
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,11 @@ template <typename Element> void move(char *dst, const char *src) {
4343
template <typename Element> void move(char *dst, const char *src, size_t size) {
4444
Element::move(dst, src, size);
4545
}
46+
// Runtime-size move from 'src' to 'dst'.
47+
template <typename Element>
48+
void move_backward(char *dst, const char *src, size_t size) {
49+
Element::move_backward(dst, src, size);
50+
}
4651

4752
// Fixed-size equality between 'lhs' and 'rhs'.
4853
template <typename Element> bool equals(const char *lhs, const char *rhs) {
@@ -96,10 +101,8 @@ template <typename Element, size_t ElementCount> struct Repeated {
96101
}
97102

98103
static void move(char *dst, const char *src) {
99-
const auto value = Element::load(src);
100-
Repeated<Element, ElementCount - 1>::move(dst + Element::SIZE,
101-
src + Element::SIZE);
102-
Element::store(dst, value);
104+
const auto value = load(src);
105+
store(dst, value);
103106
}
104107

105108
static bool equals(const char *lhs, const char *rhs) {
@@ -341,6 +344,55 @@ template <typename T, typename TailT = T> struct Loop {
341344
Tail<TailT>::copy(dst, src, size);
342345
}
343346

347+
// Move forward suitable when dst < src. We load the tail bytes before
348+
// handling the loop.
349+
//
350+
// e.g. Moving two bytes
351+
// [ | | | | |]
352+
// [___XXXXXXXXXXXXXXXXXXXXXXXXXXXXXX___]
353+
// [_________________________LLLLLLLL___]
354+
// [___LLLLLLLL_________________________]
355+
// [_SSSSSSSS___________________________]
356+
// [___________LLLLLLLL_________________]
357+
// [_________SSSSSSSS___________________]
358+
// [___________________LLLLLLLL_________]
359+
// [_________________SSSSSSSS___________]
360+
// [_______________________SSSSSSSS_____]
361+
static void move(char *dst, const char *src, size_t size) {
362+
const size_t tail_offset = Tail<T>::offset(size);
363+
const auto tail_value = TailT::load(src + tail_offset);
364+
size_t offset = 0;
365+
do {
366+
T::move(dst + offset, src + offset);
367+
offset += T::SIZE;
368+
} while (offset < size - T::SIZE);
369+
TailT::store(dst + tail_offset, tail_value);
370+
}
371+
372+
// Move backward suitable when dst > src. We load the head bytes before
373+
// handling the loop.
374+
//
375+
// e.g. Moving two bytes
376+
// [ | | | | |]
377+
// [___XXXXXXXXXXXXXXXXXXXXXXXXXXXXXX___]
378+
// [___LLLLLLLL_________________________]
379+
// [_________________________LLLLLLLL___]
380+
// [___________________________SSSSSSSS_]
381+
// [_________________LLLLLLLL___________]
382+
// [___________________SSSSSSSS_________]
383+
// [_________LLLLLLLL___________________]
384+
// [___________SSSSSSSS_________________]
385+
// [_____SSSSSSSS_______________________]
386+
static void move_backward(char *dst, const char *src, size_t size) {
387+
const auto head_value = TailT::load(src);
388+
ptrdiff_t offset = size - T::SIZE;
389+
do {
390+
T::move(dst + offset, src + offset);
391+
offset -= T::SIZE;
392+
} while (offset >= 0);
393+
TailT::store(dst, head_value);
394+
}
395+
344396
static bool equals(const char *lhs, const char *rhs, size_t size) {
345397
size_t offset = 0;
346398
do {
@@ -375,30 +427,38 @@ enum class Arg { _1, _2, Dst = _1, Src = _2, Lhs = _1, Rhs = _2 };
375427

376428
namespace internal {
377429

378-
// Provides a specialized bump function that adjusts pointers and size so first
379-
// argument (resp. second argument) gets aligned to Alignment.
380-
// We make sure the compiler knows about the adjusted pointer alignment.
381-
template <Arg arg, size_t Alignment> struct AlignHelper {};
430+
template <Arg arg> struct ArgSelector {};
382431

383-
template <size_t Alignment> struct AlignHelper<Arg::_1, Alignment> {
432+
template <> struct ArgSelector<Arg::_1> {
384433
template <typename T1, typename T2>
385-
static void bump(T1 *__restrict &p1ref, T2 *__restrict &p2ref, size_t &size) {
386-
const intptr_t offset = offset_to_next_aligned<Alignment>(p1ref);
387-
p1ref += offset;
388-
p2ref += offset;
389-
size -= offset;
390-
p1ref = assume_aligned<Alignment>(p1ref);
434+
static T1 *__restrict &Select(T1 *__restrict &p1ref, T2 *__restrict &p2ref) {
435+
return p1ref;
436+
}
437+
};
438+
439+
template <> struct ArgSelector<Arg::_2> {
440+
template <typename T1, typename T2>
441+
static T2 *__restrict &Select(T1 *__restrict &p1ref, T2 *__restrict &p2ref) {
442+
return p2ref;
391443
}
392444
};
393445

394-
template <size_t Alignment> struct AlignHelper<Arg::_2, Alignment> {
446+
// Provides a specialized bump function that adjusts pointers and size so first
447+
// argument (resp. second argument) gets aligned to Alignment.
448+
// We make sure the compiler knows about the adjusted pointer alignment.
449+
// The 'additional_bumps' parameter allows reaching previous / next aligned
450+
// pointers.
451+
template <Arg arg, size_t Alignment> struct Align {
395452
template <typename T1, typename T2>
396-
static void bump(T1 *__restrict &p1ref, T2 *__restrict &p2ref, size_t &size) {
397-
const intptr_t offset = offset_to_next_aligned<Alignment>(p2ref);
453+
static void bump(T1 *__restrict &p1ref, T2 *__restrict &p2ref, size_t &size,
454+
int additional_bumps = 0) {
455+
auto &aligned_ptr = ArgSelector<arg>::Select(p1ref, p2ref);
456+
auto offset = offset_to_next_aligned<Alignment>(aligned_ptr);
457+
offset += additional_bumps * Alignment;
398458
p1ref += offset;
399459
p2ref += offset;
400460
size -= offset;
401-
p2ref = assume_aligned<Alignment>(p2ref);
461+
aligned_ptr = assume_aligned<Alignment>(aligned_ptr);
402462
}
403463
};
404464

@@ -423,29 +483,85 @@ template <typename AlignmentT, Arg AlignOn = Arg::_1> struct Align {
423483
static void copy(char *__restrict dst, const char *__restrict src,
424484
size_t size) {
425485
AlignmentT::copy(dst, src);
426-
internal::AlignHelper<AlignOn, ALIGNMENT>::bump(dst, src, size);
486+
internal::Align<AlignOn, ALIGNMENT>::bump(dst, src, size);
427487
NextT::copy(dst, src, size);
428488
}
429489

490+
// Move forward suitable when dst < src. The alignment is performed with a
491+
// HeadTail operation of size ∈ [Alignment, 2 x Alignment].
492+
//
493+
// e.g. Moving two bytes and making sure src is then aligned.
494+
// [ | | | | ]
495+
// [____XXXXXXXXXXXXXXXXXXXXXXXXXXXX_]
496+
// [____LLLLLLLL_____________________]
497+
// [___________LLLLLLLL______________]
498+
// [_SSSSSSSS________________________]
499+
// [________SSSSSSSS_________________]
500+
//
501+
// e.g. Moving two bytes and making sure dst is then aligned.
502+
// [ | | | | ]
503+
// [____XXXXXXXXXXXXXXXXXXXXXXXXXXXX_]
504+
// [____LLLLLLLL_____________________]
505+
// [______LLLLLLLL___________________]
506+
// [_SSSSSSSS________________________]
507+
// [___SSSSSSSS______________________]
508+
static void move(char *dst, const char *src, size_t size) {
509+
char *next_dst = dst;
510+
const char *next_src = src;
511+
size_t next_size = size;
512+
internal::Align<AlignOn, ALIGNMENT>::bump(next_dst, next_src, next_size,
513+
1);
514+
HeadTail<AlignmentT>::move(dst, src, size - next_size);
515+
NextT::move(next_dst, next_src, next_size);
516+
}
517+
518+
// Move backward suitable when dst > src. The alignment is performed with a
519+
// HeadTail operation of size ∈ [Alignment, 2 x Alignment].
520+
//
521+
// e.g. Moving two bytes backward and making sure src is then aligned.
522+
// [ | | | | ]
523+
// [____XXXXXXXXXXXXXXXXXXXXXXXX_____]
524+
// [ _________________LLLLLLLL_______]
525+
// [ ___________________LLLLLLLL_____]
526+
// [____________________SSSSSSSS_____]
527+
// [______________________SSSSSSSS___]
528+
//
529+
// e.g. Moving two bytes backward and making sure dst is then aligned.
530+
// [ | | | | ]
531+
// [____XXXXXXXXXXXXXXXXXXXXXXXX_____]
532+
// [ _______________LLLLLLLL_________]
533+
// [ ___________________LLLLLLLL_____]
534+
// [__________________SSSSSSSS_______]
535+
// [______________________SSSSSSSS___]
536+
static void move_backward(char *dst, const char *src, size_t size) {
537+
char *headtail_dst = dst + size;
538+
const char *headtail_src = src + size;
539+
size_t headtail_size = 0;
540+
internal::Align<AlignOn, ALIGNMENT>::bump(headtail_dst, headtail_src,
541+
headtail_size, -2);
542+
HeadTail<AlignmentT>::move(headtail_dst, headtail_src, headtail_size);
543+
NextT::move_backward(dst, src, size - headtail_size);
544+
}
545+
430546
static bool equals(const char *lhs, const char *rhs, size_t size) {
431547
if (!AlignmentT::equals(lhs, rhs))
432548
return false;
433-
internal::AlignHelper<AlignOn, ALIGNMENT>::bump(lhs, rhs, size);
549+
internal::Align<AlignOn, ALIGNMENT>::bump(lhs, rhs, size);
434550
return NextT::equals(lhs, rhs, size);
435551
}
436552

437553
static int three_way_compare(const char *lhs, const char *rhs,
438554
size_t size) {
439555
if (!AlignmentT::equals(lhs, rhs))
440556
return AlignmentT::three_way_compare(lhs, rhs);
441-
internal::AlignHelper<AlignOn, ALIGNMENT>::bump(lhs, rhs, size);
557+
internal::Align<AlignOn, ALIGNMENT>::bump(lhs, rhs, size);
442558
return NextT::three_way_compare(lhs, rhs, size);
443559
}
444560

445561
static void splat_set(char *dst, const unsigned char value, size_t size) {
446562
AlignmentT::splat_set(dst, value);
447563
char *dummy = nullptr;
448-
internal::AlignHelper<Arg::_1, ALIGNMENT>::bump(dst, dummy, size);
564+
internal::Align<Arg::_1, ALIGNMENT>::bump(dst, dummy, size);
449565
NextT::splat_set(dst, value, size);
450566
}
451567
};

libc/test/src/string/CMakeLists.txt

+2
Original file line numberDiff line numberDiff line change
@@ -261,6 +261,8 @@ function(add_libc_multi_impl_test name)
261261
${LIBC_COMPILE_OPTIONS_NATIVE}
262262
${ARGN}
263263
)
264+
get_fq_target_name(${fq_config_name}_test fq_target_name)
265+
target_link_libraries(${fq_target_name} PRIVATE LibcMemoryHelpers)
264266
else()
265267
message(STATUS "Skipping test for '${fq_config_name}' insufficient host cpu features '${required_cpu_features}'")
266268
endif()

0 commit comments

Comments
 (0)