/*
 * SSE2/SSSE3/AVX2-optimized routines to support checksumming of bytes.
 *
 * Copyright (C) 1996 Andrew Tridgell
 * Copyright (C) 1996 Paul Mackerras
 * Copyright (C) 2004-2020 Wayne Davison
 * Copyright (C) 2020 Jorrit Jongma
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License along
 * with this program; if not, visit the http://fsf.org website.
 */
/*
 * Optimization target for get_checksum1() was the Intel Atom D2700, the
 * slowest CPU in the test set and the most likely to be CPU limited during
 * transfers. The combination of intrinsics was chosen specifically for the
 * most gain on that CPU; other combinations were occasionally slightly
 * faster on the others.
 *
 * While on more modern CPUs transfers are less likely to be CPU limited
 * (at least by this specific function), lower CPU usage is always better.
 * Improvements may still be seen when matching chunks from NVMe storage
 * even on newer CPUs.
 *
 * Benchmarks (in MB/s)            C    SSE2   SSSE3    AVX2
 * - Intel Atom D2700            550     750    1000     N/A
 * - Intel i7-7700hq            1850    2550    4050    6200
 * - AMD ThreadRipper 2950x     2900    5600    8950    8100
 *
 * Curiously, the AMD is slower with AVX2 than with SSSE3, while the Intel is
 * significantly faster. AVX2 is kept because it's more likely to relieve
 * the bottleneck on the slower CPU.
 *
 * This optimization for get_checksum1() is intentionally limited to x86-64
 * as no 32-bit CPU was available for testing. As 32-bit CPUs only have half
 * the available xmm registers, this optimized version may not be faster than
 * the pure C version anyway. Note that all x86-64 CPUs support at least SSE2.
 *
 * This file is compiled using GCC 4.8+/clang 6+'s C++ front end to allow the
 * use of the target attribute, selecting the fastest code path based on
 * dispatch priority (GCC 5) or runtime detection of CPU capabilities (GCC 6+).
 * GCC 4.x is not supported to ease configure.ac logic.
 */
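
/*
  A minimal sketch (illustrative names only) of the function multi-versioning pattern this
  file relies on: the same signature is defined once per target, the compiler emits every
  body, and the best match for the running CPU is chosen for the caller.

    __attribute__ ((target("default"))) int sum_bytes(const signed char* p, int n)
    { int s = 0; for (int k = 0; k < n; k++) s += p[k]; return s; }

    __attribute__ ((target("ssse3"))) int sum_bytes(const signed char* p, int n)
    { int s = 0; for (int k = 0; k < n; k++) s += p[k]; return s; }  // SSSE3-tuned body in practice

  Callers simply call sum_bytes(); no manual cpuid checks are needed.
*/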

#ifdef __x86_64__
#ifdef __cplusplus

#include "rsync.h"

#ifdef HAVE_SIMD

#include <immintrin.h>

/* Some clang versions don't like it when you use static with multi-versioned functions: linker errors */
#ifdef __clang__
#define MVSTATIC
#else
#define MVSTATIC static
#endif

// Missing from the headers on gcc 6 and older, clang 8 and older
typedef long long __m128i_u __attribute__((__vector_size__(16), __may_alias__, __aligned__(1)));
typedef long long __m256i_u __attribute__((__vector_size__(32), __may_alias__, __aligned__(1)));

/* Compatibility macros to let our SSSE3 algorithm run with only SSE2.
   These used to be neat individual functions with target attributes switching between SSE2 and SSSE3 implementations
   as needed, but though this works perfectly with GCC, clang fails to inline those properly, leading to a near 50%
   performance drop - combined with static and inline modifiers you get linker errors and even compiler crashes. So
   we now use macros. */
#define SSE2_INTERLEAVE_ODD_EPI16(a, b) _mm_packs_epi32(_mm_srai_epi32(a, 16), _mm_srai_epi32(b, 16))
#define SSE2_INTERLEAVE_EVEN_EPI16(a, b) SSE2_INTERLEAVE_ODD_EPI16(_mm_slli_si128(a, 2), _mm_slli_si128(b, 2))
#define SSE2_MULU_ODD_EPI8(a, b) _mm_mullo_epi16(_mm_srli_epi16(a, 8), _mm_srai_epi16(b, 8))
#define SSE2_MULU_EVEN_EPI8(a, b) _mm_mullo_epi16(_mm_and_si128(a, _mm_set1_epi16(0xFF)), _mm_srai_epi16(_mm_slli_si128(b, 1), 8))

#define SSE2_HADDS_EPI16(a, b) _mm_adds_epi16(SSE2_INTERLEAVE_EVEN_EPI16(a, b), SSE2_INTERLEAVE_ODD_EPI16(a, b))
#define SSE2_MADDUBS_EPI16(a, b) _mm_adds_epi16(SSE2_MULU_EVEN_EPI8(a, b), SSE2_MULU_ODD_EPI8(a, b))
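
/* For reference, a scalar sketch of what SSE2_MADDUBS_EPI16(a, b) - like the real
   _mm_maddubs_epi16 - computes per 16-bit output lane j, with a treated as unsigned
   bytes and b as signed bytes:

       out[j] = saturate_int16((uint8)a[2*j] * (int8)b[2*j] + (uint8)a[2*j+1] * (int8)b[2*j+1]);

   In this file a is always a small constant pattern (all 1s, or 4/3/2/1), so the sums stay
   well inside int16 range and the saturation never triggers. */

// The "default"-target stubs below are selected when the running CPU lacks the corresponding
// instruction set; they return i unchanged so the remaining routines (ultimately the plain C
// fallback) process the data instead.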
__attribute__ ((target("default"))) MVSTATIC int32 get_checksum1_ssse3_32(schar* buf, int32 len, int32 i, uint32* ps1, uint32* ps2) { return i; }
__attribute__ ((target("default"))) MVSTATIC int32 get_checksum1_sse2_32(schar* buf, int32 len, int32 i, uint32* ps1, uint32* ps2) { return i; }

/*
  Original loop per 4 bytes:
    s2 += 4*(s1 + buf[i]) + 3*buf[i+1] + 2*buf[i+2] + buf[i+3] + 10*CHAR_OFFSET;
    s1 += buf[i] + buf[i+1] + buf[i+2] + buf[i+3] + 4*CHAR_OFFSET;

  SSE2/SSSE3 loop per 32 bytes:
    int16 t1[8];
    int16 t2[8];
    for (int j = 0; j < 8; j++) {
      t1[j] = buf[j*4 + i] + buf[j*4 + i+1] + buf[j*4 + i+2] + buf[j*4 + i+3];
      t2[j] = 4*buf[j*4 + i] + 3*buf[j*4 + i+1] + 2*buf[j*4 + i+2] + buf[j*4 + i+3];
    }
    s2 += 32*s1 + (uint32)(
              28*t1[0] + 24*t1[1] + 20*t1[2] + 16*t1[3] + 12*t1[4] + 8*t1[5] + 4*t1[6] +
              t2[0] + t2[1] + t2[2] + t2[3] + t2[4] + t2[5] + t2[6] + t2[7]
          ) + 528*CHAR_OFFSET;
    s1 += (uint32)(t1[0] + t1[1] + t1[2] + t1[3] + t1[4] + t1[5] + t1[6] + t1[7]) +
          32*CHAR_OFFSET;
*/
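/*
  Why the 32-byte step above matches the per-byte recurrence (s1 += buf[k] + CHAR_OFFSET;
  s2 += s1): unrolling 32 bytes starting at i gives

    s1' = s1 + sum(buf[i..i+31]) + 32*CHAR_OFFSET
    s2' = s2 + 32*s1 + sum_{k=0..31} (32-k)*(buf[i+k] + CHAR_OFFSET)
        = s2 + 32*s1 + sum_{k=0..31} (32-k)*buf[i+k] + 528*CHAR_OFFSET     (1+2+...+32 = 528)

  Writing k = 4*j + m (j = 0..7, m = 0..3), the weight 32-k splits into (28-4*j) + (4-m), so
  the data term becomes sum_j (28-4*j)*t1[j] + sum_j t2[j], which is exactly the expression
  used above.
*/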
__attribute__ ((target("ssse3"))) MVSTATIC int32 get_checksum1_ssse3_32(schar* buf, int32 len, int32 i, uint32* ps1, uint32* ps2)
{
	if (len > 32) {
		int aligned = ((uintptr_t)buf & 15) == 0;

		uint32 x[4] = {0};
		x[0] = *ps1;
		__m128i ss1 = _mm_loadu_si128((__m128i_u*)x);
		x[0] = *ps2;
		__m128i ss2 = _mm_loadu_si128((__m128i_u*)x);

		const int16 mul_t1_buf[8] = {28, 24, 20, 16, 12, 8, 4, 0};
		__m128i mul_t1 = _mm_loadu_si128((__m128i_u*)mul_t1_buf);

		for (; i < (len-32); i+=32) {
			// Load ... 2*[int8*16]
			__m128i in8_1, in8_2;
			if (!aligned) {
				// Synonymous with _mm_loadu_si128 on all but a handful of old CPUs
				in8_1 = _mm_lddqu_si128((__m128i_u*)&buf[i]);
				in8_2 = _mm_lddqu_si128((__m128i_u*)&buf[i + 16]);
			} else {
				in8_1 = _mm_load_si128((__m128i_u*)&buf[i]);
				in8_2 = _mm_load_si128((__m128i_u*)&buf[i + 16]);
			}

			// (1*buf[i] + 1*buf[i+1]), (1*buf[i+2] + 1*buf[i+3]), ... 2*[int16*8]
			// Fastest, even though multiply by 1
			__m128i mul_one = _mm_set1_epi8(1);
			__m128i add16_1 = _mm_maddubs_epi16(mul_one, in8_1);
			__m128i add16_2 = _mm_maddubs_epi16(mul_one, in8_2);

			// (4*buf[i] + 3*buf[i+1]), (2*buf[i+2] + buf[i+3]), ... 2*[int16*8]
			__m128i mul_const = _mm_set1_epi32(4 + (3 << 8) + (2 << 16) + (1 << 24));
			__m128i mul_add16_1 = _mm_maddubs_epi16(mul_const, in8_1);
			__m128i mul_add16_2 = _mm_maddubs_epi16(mul_const, in8_2);

			// s2 += 32*s1
			ss2 = _mm_add_epi32(ss2, _mm_slli_epi32(ss1, 5));

			// [sum(t1[0]..t1[7]), X, X, X] [int32*4]; faster than multiple _mm_hadds_epi16
			// Shifting left, then shifting right again and shuffling (rather than just
			// shifting right as with mul32 below) to cheaply end up with the correct sign
			// extension as we go from int16 to int32.
			__m128i sum_add32 = _mm_add_epi16(add16_1, add16_2);
			sum_add32 = _mm_add_epi16(sum_add32, _mm_slli_si128(sum_add32, 2));
			sum_add32 = _mm_add_epi16(sum_add32, _mm_slli_si128(sum_add32, 4));
			sum_add32 = _mm_add_epi16(sum_add32, _mm_slli_si128(sum_add32, 8));
			sum_add32 = _mm_srai_epi32(sum_add32, 16);
			sum_add32 = _mm_shuffle_epi32(sum_add32, 3);
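			// How the cascade above works: after the three shifted adds, 16-bit lane k holds
			// the sum of lanes 0..k, so the top lane has the total of all eight values. The
			// arithmetic 32-bit shift then sign-extends that 16-bit total into the top 32-bit
			// lane, and the shuffle (imm 3) moves it down to lane 0 - the only lane of
			// ss1/ss2 that is read back after the loop.
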
			// [sum(t2[0]..t2[7]), X, X, X] [int32*4]; faster than multiple _mm_hadds_epi16
			__m128i sum_mul_add32 = _mm_add_epi16(mul_add16_1, mul_add16_2);
			sum_mul_add32 = _mm_add_epi16(sum_mul_add32, _mm_slli_si128(sum_mul_add32, 2));
			sum_mul_add32 = _mm_add_epi16(sum_mul_add32, _mm_slli_si128(sum_mul_add32, 4));
			sum_mul_add32 = _mm_add_epi16(sum_mul_add32, _mm_slli_si128(sum_mul_add32, 8));
			sum_mul_add32 = _mm_srai_epi32(sum_mul_add32, 16);
			sum_mul_add32 = _mm_shuffle_epi32(sum_mul_add32, 3);

			// s1 += t1[0] + t1[1] + t1[2] + t1[3] + t1[4] + t1[5] + t1[6] + t1[7]
			ss1 = _mm_add_epi32(ss1, sum_add32);

			// s2 += t2[0] + t2[1] + t2[2] + t2[3] + t2[4] + t2[5] + t2[6] + t2[7]
			ss2 = _mm_add_epi32(ss2, sum_mul_add32);

			// [t1[0] + t1[1], t1[2] + t1[3], ...] [int16*8]
			// We could've combined this with generating sum_add32 above and
			// saved an instruction, but benchmarking shows that as being slower
			__m128i add16 = _mm_hadds_epi16(add16_1, add16_2);

			// [t1[0], t1[1], ...] -> [t1[0]*28 + t1[1]*24, ...] [int32*4]
			__m128i mul32 = _mm_madd_epi16(add16, mul_t1);

			// [sum(mul32), X, X, X] [int32*4]; faster than multiple _mm_hadd_epi32
			mul32 = _mm_add_epi32(mul32, _mm_srli_si128(mul32, 4));
			mul32 = _mm_add_epi32(mul32, _mm_srli_si128(mul32, 8));

			// s2 += 28*t1[0] + 24*t1[1] + 20*t1[2] + 16*t1[3] + 12*t1[4] + 8*t1[5] + 4*t1[6]
			ss2 = _mm_add_epi32(ss2, mul32);
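
			// Note on the three steps above: add16 holds [t1[0]..t1[7]] as int16, and
			// _mm_madd_epi16 with mul_t1 = {28,24,20,16,12,8,4,0} produces the pair sums
			// [28*t1[0]+24*t1[1], 20*t1[2]+16*t1[3], 12*t1[4]+8*t1[5], 4*t1[6]+0*t1[7]],
			// which the two shifted adds then reduce into lane 0 as the full weighted sum.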
#if CHAR_OFFSET != 0
			// s1 += 32*CHAR_OFFSET
			__m128i char_offset_multiplier = _mm_set1_epi32(32 * CHAR_OFFSET);
			ss1 = _mm_add_epi32(ss1, char_offset_multiplier);

			// s2 += 528*CHAR_OFFSET
			char_offset_multiplier = _mm_set1_epi32(528 * CHAR_OFFSET);
			ss2 = _mm_add_epi32(ss2, char_offset_multiplier);
#endif
		}

		_mm_store_si128((__m128i_u*)x, ss1);
		*ps1 = x[0];
		_mm_store_si128((__m128i_u*)x, ss2);
		*ps2 = x[0];
	}
	return i;
}

/*
  Same as the SSSE3 version, but using the macros defined above to emulate the SSSE3 calls that are not available
  with SSE2. For GCC-only builds the SSE2 and SSSE3 versions could be a single function calling other functions with
  the right target attributes to emulate SSSE3 calls on SSE2 if needed, but clang doesn't inline those properly,
  leading to a near 50% performance drop.
*/
__attribute__ ((target("sse2"))) MVSTATIC int32 get_checksum1_sse2_32(schar* buf, int32 len, int32 i, uint32* ps1, uint32* ps2)
{
	if (len > 32) {
		int aligned = ((uintptr_t)buf & 15) == 0;

		uint32 x[4] = {0};
		x[0] = *ps1;
		__m128i ss1 = _mm_loadu_si128((__m128i_u*)x);
		x[0] = *ps2;
		__m128i ss2 = _mm_loadu_si128((__m128i_u*)x);

		const int16 mul_t1_buf[8] = {28, 24, 20, 16, 12, 8, 4, 0};
		__m128i mul_t1 = _mm_loadu_si128((__m128i_u*)mul_t1_buf);

		for (; i < (len-32); i+=32) {
			// Load ... 2*[int8*16]
			__m128i in8_1, in8_2;
			if (!aligned) {
				in8_1 = _mm_loadu_si128((__m128i_u*)&buf[i]);
				in8_2 = _mm_loadu_si128((__m128i_u*)&buf[i + 16]);
			} else {
				in8_1 = _mm_load_si128((__m128i_u*)&buf[i]);
				in8_2 = _mm_load_si128((__m128i_u*)&buf[i + 16]);
			}

			// (1*buf[i] + 1*buf[i+1]), (1*buf[i+2] + 1*buf[i+3]), ... 2*[int16*8]
			// Fastest, even though multiply by 1
			__m128i mul_one = _mm_set1_epi8(1);
			__m128i add16_1 = SSE2_MADDUBS_EPI16(mul_one, in8_1);
			__m128i add16_2 = SSE2_MADDUBS_EPI16(mul_one, in8_2);

			// (4*buf[i] + 3*buf[i+1]), (2*buf[i+2] + buf[i+3]), ... 2*[int16*8]
			__m128i mul_const = _mm_set1_epi32(4 + (3 << 8) + (2 << 16) + (1 << 24));
			__m128i mul_add16_1 = SSE2_MADDUBS_EPI16(mul_const, in8_1);
			__m128i mul_add16_2 = SSE2_MADDUBS_EPI16(mul_const, in8_2);

			// s2 += 32*s1
			ss2 = _mm_add_epi32(ss2, _mm_slli_epi32(ss1, 5));

			// [sum(t1[0]..t1[7]), X, X, X] [int32*4]; faster than multiple _mm_hadds_epi16
			// Shifting left, then shifting right again and shuffling (rather than just
			// shifting right as with mul32 below) to cheaply end up with the correct sign
			// extension as we go from int16 to int32.
			__m128i sum_add32 = _mm_add_epi16(add16_1, add16_2);
			sum_add32 = _mm_add_epi16(sum_add32, _mm_slli_si128(sum_add32, 2));
			sum_add32 = _mm_add_epi16(sum_add32, _mm_slli_si128(sum_add32, 4));
			sum_add32 = _mm_add_epi16(sum_add32, _mm_slli_si128(sum_add32, 8));
			sum_add32 = _mm_srai_epi32(sum_add32, 16);
			sum_add32 = _mm_shuffle_epi32(sum_add32, 3);

			// [sum(t2[0]..t2[7]), X, X, X] [int32*4]; faster than multiple _mm_hadds_epi16
			__m128i sum_mul_add32 = _mm_add_epi16(mul_add16_1, mul_add16_2);
			sum_mul_add32 = _mm_add_epi16(sum_mul_add32, _mm_slli_si128(sum_mul_add32, 2));
			sum_mul_add32 = _mm_add_epi16(sum_mul_add32, _mm_slli_si128(sum_mul_add32, 4));
			sum_mul_add32 = _mm_add_epi16(sum_mul_add32, _mm_slli_si128(sum_mul_add32, 8));
			sum_mul_add32 = _mm_srai_epi32(sum_mul_add32, 16);
			sum_mul_add32 = _mm_shuffle_epi32(sum_mul_add32, 3);

			// s1 += t1[0] + t1[1] + t1[2] + t1[3] + t1[4] + t1[5] + t1[6] + t1[7]
			ss1 = _mm_add_epi32(ss1, sum_add32);

			// s2 += t2[0] + t2[1] + t2[2] + t2[3] + t2[4] + t2[5] + t2[6] + t2[7]
			ss2 = _mm_add_epi32(ss2, sum_mul_add32);

			// [t1[0] + t1[1], t1[2] + t1[3], ...] [int16*8]
			// We could've combined this with generating sum_add32 above and
			// saved an instruction, but benchmarking shows that as being slower
			__m128i add16 = SSE2_HADDS_EPI16(add16_1, add16_2);

			// [t1[0], t1[1], ...] -> [t1[0]*28 + t1[1]*24, ...] [int32*4]
			__m128i mul32 = _mm_madd_epi16(add16, mul_t1);

			// [sum(mul32), X, X, X] [int32*4]; faster than multiple _mm_hadd_epi32
			mul32 = _mm_add_epi32(mul32, _mm_srli_si128(mul32, 4));
			mul32 = _mm_add_epi32(mul32, _mm_srli_si128(mul32, 8));

			// s2 += 28*t1[0] + 24*t1[1] + 20*t1[2] + 16*t1[3] + 12*t1[4] + 8*t1[5] + 4*t1[6]
			ss2 = _mm_add_epi32(ss2, mul32);

#if CHAR_OFFSET != 0
			// s1 += 32*CHAR_OFFSET
			__m128i char_offset_multiplier = _mm_set1_epi32(32 * CHAR_OFFSET);
			ss1 = _mm_add_epi32(ss1, char_offset_multiplier);

			// s2 += 528*CHAR_OFFSET
			char_offset_multiplier = _mm_set1_epi32(528 * CHAR_OFFSET);
			ss2 = _mm_add_epi32(ss2, char_offset_multiplier);
#endif
		}

		_mm_store_si128((__m128i_u*)x, ss1);
		*ps1 = x[0];
		_mm_store_si128((__m128i_u*)x, ss2);
		*ps2 = x[0];
	}
	return i;
}
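
// The AVX2 routine is only declared below; its definition lives outside this C++ file. Like
// the helpers above, it consumes whole blocks (multiples of 64 bytes) starting at i and
// returns the updated index, leaving the remainder for the slower paths.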
extern "C" __attribute__ ((target("avx2"))) int32 get_checksum1_avx2(schar* buf, int32 len, int32 i, uint32* ps1, uint32* ps2);

static int32 get_checksum1_default_1(schar* buf, int32 len, int32 i, uint32* ps1, uint32* ps2)
{
	uint32 s1 = *ps1;
	uint32 s2 = *ps2;
	for (; i < (len-4); i+=4) {
		s2 += 4*(s1 + buf[i]) + 3*buf[i+1] + 2*buf[i+2] + buf[i+3] + 10*CHAR_OFFSET;
		s1 += (buf[i+0] + buf[i+1] + buf[i+2] + buf[i+3] + 4*CHAR_OFFSET);
	}
	for (; i < len; i++) {
		s1 += (buf[i]+CHAR_OFFSET); s2 += s1;
	}
	*ps1 = s1;
	*ps2 = s2;
	return i;
}

/* With GCC 10 putting this implementation inside 'extern "C"' causes an
 * assembler error. That worked fine on GCC 5-9 and clang 6-10...
 */
static inline uint32 get_checksum1_cpp(char *buf1, int32 len)
{
	int32 i = 0;
	uint32 s1 = 0;
	uint32 s2 = 0;

	// multiples of 64 bytes using AVX2 (if available)
	i = get_checksum1_avx2((schar*)buf1, len, i, &s1, &s2);

	// multiples of 32 bytes using SSSE3 (if available)
	i = get_checksum1_ssse3_32((schar*)buf1, len, i, &s1, &s2);

	// multiples of 32 bytes using SSE2 (if available)
	i = get_checksum1_sse2_32((schar*)buf1, len, i, &s1, &s2);

	// whatever is left
	i = get_checksum1_default_1((schar*)buf1, len, i, &s1, &s2);

	return (s1 & 0xffff) + (s2 << 16);
}
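
/*
  Each helper above advances i by as many whole blocks as it can handle and returns the new
  index; the multi-versioned SSSE3/SSE2 entries fall back to "default" stubs that return i
  unchanged on CPUs without those instructions. Whatever remains is finished by the plain C
  routine, so the packed (s1, s2) result is identical no matter which SIMD paths were taken.
*/
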
extern "C" {

uint32 get_checksum1(char *buf1, int32 len)
{
	return get_checksum1_cpp(buf1, len);
}

}

#ifdef BENCHMARK_SIMD_CHECKSUM1
#pragma clang optimize off
#pragma GCC push_options
#pragma GCC optimize ("O0")

#define ROUNDS 1024 // iterations per benchmark
#define BLOCK_LEN 1024*1024

#ifndef CLOCK_MONOTONIC_RAW
#define CLOCK_MONOTONIC_RAW CLOCK_MONOTONIC
#endif

static void benchmark(const char* desc, int32 (*func)(schar* buf, int32 len, int32 i, uint32* ps1, uint32* ps2), schar* buf, int32 len) {
	struct timespec start, end;
	uint64 us;
	uint32 cs, s1, s2;
	int i, next;

	clock_gettime(CLOCK_MONOTONIC_RAW, &start);
	for (i = 0; i < ROUNDS; i++) {
		s1 = s2 = 0;
		next = func((schar*)buf, len, 0, &s1, &s2);
		get_checksum1_default_1((schar*)buf, len, next, &s1, &s2);
	}
	clock_gettime(CLOCK_MONOTONIC_RAW, &end);
	us = next == 0 ? 0 : (end.tv_sec - start.tv_sec) * 1000000 + (end.tv_nsec - start.tv_nsec) / 1000;
	cs = next == 0 ? 0 : (s1 & 0xffff) + (s2 << 16);
	printf("%-5s :: %5.0f MB/s :: %08x\n", desc, us ? (float)(len / (1024 * 1024) * ROUNDS) / ((float)us / 1000000.0f) : 0, cs);
}

static int32 get_checksum1_auto(schar* buf, int32 len, int32 i, uint32* ps1, uint32* ps2) {
	uint32 cs = get_checksum1((char*)buf, len);
	*ps1 = cs & 0xffff;
	*ps2 = cs >> 16;
	return len;
}

int main() {
	int i;
	unsigned char* buf = (unsigned char*)aligned_alloc(64, BLOCK_LEN);
	for (i = 0; i < BLOCK_LEN; i++) buf[i] = (i + (i % 3) + (i % 11)) % 256;

	benchmark("Auto", get_checksum1_auto, (schar*)buf, BLOCK_LEN);
	benchmark("Raw-C", get_checksum1_default_1, (schar*)buf, BLOCK_LEN);
	benchmark("SSE2", get_checksum1_sse2_32, (schar*)buf, BLOCK_LEN);
	benchmark("SSSE3", get_checksum1_ssse3_32, (schar*)buf, BLOCK_LEN);
	benchmark("AVX2", get_checksum1_avx2, (schar*)buf, BLOCK_LEN);

	free(buf);
	return 0;
}

#pragma GCC pop_options
#pragma clang optimize on
#endif /* BENCHMARK_SIMD_CHECKSUM1 */

#endif /* HAVE_SIMD */
#endif /* __cplusplus */
#endif /* __x86_64__ */