simd-checksum-x86_64.cpp

   1 /*
   2  * SSE2/SSSE3/AVX2-optimized routines to support checksumming of bytes.
   3  *
   4  * Copyright (C) 1996 Andrew Tridgell
   5  * Copyright (C) 1996 Paul Mackerras
   6  * Copyright (C) 2004-2020 Wayne Davison
   7  * Copyright (C) 2020 Jorrit Jongma
   8  *
   9  * This program is free software; you can redistribute it and/or modify
  10  * it under the terms of the GNU General Public License as published by
  11  * the Free Software Foundation; either version 3 of the License, or
  12  * (at your option) any later version.
  13  *
  14  * This program is distributed in the hope that it will be useful,
  15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  17  * GNU General Public License for more details.
  18  *
  19  * You should have received a copy of the GNU General Public License along
  20  * with this program; if not, visit the http://fsf.org website.
  21  */
  22 /*
  23  * Optimization target for get_checksum1() was the Intel Atom D2700, the
  24  * slowest CPU in the test set and the most likely to be CPU limited during
  25  * transfers. The combination of intrinsics was chosen specifically for the
  26  * most gain on that CPU, other combinations were occasionally slightly
  27  * faster on the others.
  28  *
  29  * While on more modern CPUs transfers are less likely to be CPU limited
  30  * (at least by this specific function), lower CPU usage is always better.
  31  * Improvements may still be seen when matching chunks from NVMe storage
  32  * even on newer CPUs.
  33  *
  34  * Benchmarks (in MB/s)            C    SSE2   SSSE3    AVX2
  35  * - Intel Atom D2700            550     750    1000     N/A
  36  * - Intel i7-7700hq            1850    2550    4050    6200
  37  * - AMD ThreadRipper 2950x     2900    5600    8950    8100
  38  *
  39  * Curiously the AMD is slower with AVX2 than SSSE3, while the Intel is
  40  * significantly faster. AVX2 is kept because it's more likely to relieve
  41  * the bottleneck on the slower CPU.
  42  *
  43  * This optimization for get_checksum1() is intentionally limited to x86-64
  44  * as no 32-bit CPU was available for testing. As 32-bit CPUs only have half
  45  * the available xmm registers, this optimized version may not be faster than
  46  * the pure C version anyway. Note that all x86-64 CPUs support at least SSE2.
  47  *
  48  * This file is compiled using GCC 4.8+/clang 6+'s C++ front end to allow the
  49  * use of the target attribute, selecting the fastest code path based on
  50  * dispatch priority (GCC 5) or runtime detection of CPU capabilities (GCC 6+).
  51  * GCC 4.x are not supported to ease configure.ac logic.
  52  */
  53
  54 #ifdef __x86_64__
  55 #ifdef __cplusplus
  56
  57 #include "rsync.h"
  58
  59 #ifdef HAVE_SIMD
  60
  61 #include <immintrin.h>
  62
  63 /* Some clang versions don't like it when you use static with multi-versioned functions: linker errors */
  64 #ifdef __clang__
  65 #define MVSTATIC
  66 #else
  67 #define MVSTATIC static
  68 #endif
  69
  70 // Missing from the headers on gcc 6 and older, clang 8 and older
  71 typedef long long __m128i_u __attribute__((__vector_size__(16), __may_alias__, __aligned__(1)));
  72 typedef long long __m256i_u __attribute__((__vector_size__(32), __may_alias__, __aligned__(1)));
  73
  74 /* Compatibility macros to let our SSSE3 algorithm run with only SSE2.
  75    These used to be neat individual functions with target attributes switching between SSE2 and SSSE3 implementations
  76    as needed, but though this works perfectly with GCC, clang fails to inline those properly leading to a near 50%
  77    performance drop - combined with static and inline modifiers gets you linker errors and even compiler crashes...
  78 */
  79
  80 #define SSE2_INTERLEAVE_ODD_EPI16(a, b) _mm_packs_epi32(_mm_srai_epi32(a, 16), _mm_srai_epi32(b, 16))
  81 #define SSE2_INTERLEAVE_EVEN_EPI16(a, b) SSE2_INTERLEAVE_ODD_EPI16(_mm_slli_si128(a, 2), _mm_slli_si128(b, 2))
  82 #define SSE2_MULU_ODD_EPI8(a, b) _mm_mullo_epi16(_mm_srli_epi16(a, 8), _mm_srai_epi16(b, 8))
  83 #define SSE2_MULU_EVEN_EPI8(a, b) _mm_mullo_epi16(_mm_and_si128(a, _mm_set1_epi16(0xFF)), _mm_srai_epi16(_mm_slli_si128(b, 1), 8))
  84
  85 #define SSE2_HADDS_EPI16(a, b) _mm_adds_epi16(SSE2_INTERLEAVE_EVEN_EPI16(a, b), SSE2_INTERLEAVE_ODD_EPI16(a, b))
  86 #define SSE2_MADDUBS_EPI16(a, b) _mm_adds_epi16(SSE2_MULU_EVEN_EPI8(a, b), SSE2_MULU_ODD_EPI8(a, b))
  87
  88 __attribute__ ((target("default"))) MVSTATIC int32 get_checksum1_ssse3_32(schar* buf, int32 len, int32 i, uint32* ps1, uint32* ps2) { return i; }
  89 __attribute__ ((target("default"))) MVSTATIC int32 get_checksum1_sse2_32(schar* buf, int32 len, int32 i, uint32* ps1, uint32* ps2) { return i; }
  90
  91 /*
  92   Original loop per 4 bytes:
  93     s2 += 4*(s1 + buf[i]) + 3*buf[i+1] + 2*buf[i+2] + buf[i+3] + 10*CHAR_OFFSET;
  94     s1 += buf[i] + buf[i+1] + buf[i+2] + buf[i+3] + 4*CHAR_OFFSET;
  95
  96   SSE2/SSSE3 loop per 32 bytes:
  97     int16 t1[8];
  98     int16 t2[8];
  99     for (int j = 0; j < 8; j++) {
 100       t1[j] = buf[j*4 + i] + buf[j*4 + i+1] + buf[j*4 + i+2] + buf[j*4 + i+3];
 101       t2[j] = 4*buf[j*4 + i] + 3*buf[j*4 + i+1] + 2*buf[j*4 + i+2] + buf[j*4 + i+3];
 102     }
 103     s2 += 32*s1 + (uint32)(
 104               28*t1[0] + 24*t1[1] + 20*t1[2] + 16*t1[3] + 12*t1[4] + 8*t1[5] + 4*t1[6] +
 105               t2[0] + t2[1] + t2[2] + t2[3] + t2[4] + t2[5] + t2[6] + t2[7]
 106           ) + 528*CHAR_OFFSET;
 107     s1 += (uint32)(t1[0] + t1[1] + t1[2] + t1[3] + t1[4] + t1[5] + t1[6] + t1[7]) +
 108           32*CHAR_OFFSET;
 109  */
 110 __attribute__ ((target("ssse3"))) MVSTATIC int32 get_checksum1_ssse3_32(schar* buf, int32 len, int32 i, uint32* ps1, uint32* ps2)
 111 {
 112     if (len > 32) {
 113         int aligned = ((uintptr_t)buf & 15) == 0;
 114
 115         uint32 x[4] = {0};
 116         x[0] = *ps1;
 117         __m128i ss1 = _mm_loadu_si128((__m128i_u*)x);
 118         x[0] = *ps2;
 119         __m128i ss2 = _mm_loadu_si128((__m128i_u*)x);
 120
 121         const int16 mul_t1_buf[8] = {28, 24, 20, 16, 12, 8, 4, 0};
 122         __m128i mul_t1 = _mm_loadu_si128((__m128i_u*)mul_t1_buf);
 123
 124         for (; i < (len-32); i+=32) {
 125             // Load ... 2*[int8*16]
 126             __m128i in8_1, in8_2;
 127             if (!aligned) {
 128                 // Synonymous with _mm_loadu_si128 on all but a handful of old CPUs
 129                 in8_1 = _mm_lddqu_si128((__m128i_u*)&buf[i]);
 130                 in8_2 = _mm_lddqu_si128((__m128i_u*)&buf[i + 16]);
 131             } else {
 132                 in8_1 = _mm_load_si128((__m128i_u*)&buf[i]);
 133                 in8_2 = _mm_load_si128((__m128i_u*)&buf[i + 16]);
 134             }
 135
 136             // (1*buf[i] + 1*buf[i+1]), (1*buf[i+2], 1*buf[i+3]), ... 2*[int16*8]
 137             // Fastest, even though multiply by 1
 138             __m128i mul_one = _mm_set1_epi8(1);
 139             __m128i add16_1 = _mm_maddubs_epi16(mul_one, in8_1);
 140             __m128i add16_2 = _mm_maddubs_epi16(mul_one, in8_2);
 141
 142             // (4*buf[i] + 3*buf[i+1]), (2*buf[i+2], buf[i+3]), ... 2*[int16*8]
 143             __m128i mul_const = _mm_set1_epi32(4 + (3 << 8) + (2 << 16) + (1 << 24));
 144             __m128i mul_add16_1 = _mm_maddubs_epi16(mul_const, in8_1);
 145             __m128i mul_add16_2 = _mm_maddubs_epi16(mul_const, in8_2);
 146
 147             // s2 += 32*s1
 148             ss2 = _mm_add_epi32(ss2, _mm_slli_epi32(ss1, 5));
 149
 150             // [sum(t1[0]..t1[7]), X, X, X] [int32*4]; faster than multiple _mm_hadds_epi16
 151             // Shifting left, then shifting right again and shuffling (rather than just
 152             // shifting right as with mul32 below) to cheaply end up with the correct sign
 153             // extension as we go from int16 to int32.
 154             __m128i sum_add32 = _mm_add_epi16(add16_1, add16_2);
 155             sum_add32 = _mm_add_epi16(sum_add32, _mm_slli_si128(sum_add32, 2));
 156             sum_add32 = _mm_add_epi16(sum_add32, _mm_slli_si128(sum_add32, 4));
 157             sum_add32 = _mm_add_epi16(sum_add32, _mm_slli_si128(sum_add32, 8));
 158             sum_add32 = _mm_srai_epi32(sum_add32, 16);
 159             sum_add32 = _mm_shuffle_epi32(sum_add32, 3);
 160
 161             // [sum(t2[0]..t2[7]), X, X, X] [int32*4]; faster than multiple _mm_hadds_epi16
 162             __m128i sum_mul_add32 = _mm_add_epi16(mul_add16_1, mul_add16_2);
 163             sum_mul_add32 = _mm_add_epi16(sum_mul_add32, _mm_slli_si128(sum_mul_add32, 2));
 164             sum_mul_add32 = _mm_add_epi16(sum_mul_add32, _mm_slli_si128(sum_mul_add32, 4));
 165             sum_mul_add32 = _mm_add_epi16(sum_mul_add32, _mm_slli_si128(sum_mul_add32, 8));
 166             sum_mul_add32 = _mm_srai_epi32(sum_mul_add32, 16);
 167             sum_mul_add32 = _mm_shuffle_epi32(sum_mul_add32, 3);
 168
 169             // s1 += t1[0] + t1[1] + t1[2] + t1[3] + t1[4] + t1[5] + t1[6] + t1[7]
 170             ss1 = _mm_add_epi32(ss1, sum_add32);
 171
 172             // s2 += t2[0] + t2[1] + t2[2] + t2[3] + t2[4] + t2[5] + t2[6] + t2[7]
 173             ss2 = _mm_add_epi32(ss2, sum_mul_add32);
 174
 175             // [t1[0] + t1[1], t1[2] + t1[3] ...] [int16*8]
 176             // We could've combined this with generating sum_add32 above and
 177             // save an instruction but benchmarking shows that as being slower
 178             __m128i add16 = _mm_hadds_epi16(add16_1, add16_2);
 179
 180             // [t1[0], t1[1], ...] -> [t1[0]*28 + t1[1]*24, ...] [int32*4]
 181             __m128i mul32 = _mm_madd_epi16(add16, mul_t1);
 182
 183             // [sum(mul32), X, X, X] [int32*4]; faster than multiple _mm_hadd_epi32
 184             mul32 = _mm_add_epi32(mul32, _mm_srli_si128(mul32, 4));
 185             mul32 = _mm_add_epi32(mul32, _mm_srli_si128(mul32, 8));
 186
 187             // s2 += 28*t1[0] + 24*t1[1] + 20*t1[2] + 16*t1[3] + 12*t1[4] + 8*t1[5] + 4*t1[6]
 188             ss2 = _mm_add_epi32(ss2, mul32);
 189
 190 #if CHAR_OFFSET != 0
 191             // s1 += 32*CHAR_OFFSET
 192             __m128i char_offset_multiplier = _mm_set1_epi32(32 * CHAR_OFFSET);
 193             ss1 = _mm_add_epi32(ss1, char_offset_multiplier);
 194
 195             // s2 += 528*CHAR_OFFSET
 196             char_offset_multiplier = _mm_set1_epi32(528 * CHAR_OFFSET);
 197             ss2 = _mm_add_epi32(ss2, char_offset_multiplier);
 198 #endif
 199         }
 200
 201         _mm_store_si128((__m128i_u*)x, ss1);
 202         *ps1 = x[0];
 203         _mm_store_si128((__m128i_u*)x, ss2);
 204         *ps2 = x[0];
 205     }
 206     return i;
 207 }
 208
 209 /*
 210   Same as SSSE3 version, but using macros defined above to emulate SSSE3 calls that are not available with SSE2.
 211   For GCC-only the SSE2 and SSSE3 versions could be a single function calling other functions with the right
 212   target attributes to emulate SSSE3 calls on SSE2 if needed, but clang doesn't inline those properly leading
 213   to a near 50% performance drop.
 214  */
 215 __attribute__ ((target("sse2"))) MVSTATIC int32 get_checksum1_sse2_32(schar* buf, int32 len, int32 i, uint32* ps1, uint32* ps2)
 216 {
 217     if (len > 32) {
 218         int aligned = ((uintptr_t)buf & 15) == 0;
 219
 220         uint32 x[4] = {0};
 221         x[0] = *ps1;
 222         __m128i ss1 = _mm_loadu_si128((__m128i_u*)x);
 223         x[0] = *ps2;
 224         __m128i ss2 = _mm_loadu_si128((__m128i_u*)x);
 225
 226         const int16 mul_t1_buf[8] = {28, 24, 20, 16, 12, 8, 4, 0};
 227         __m128i mul_t1 = _mm_loadu_si128((__m128i_u*)mul_t1_buf);
 228
 229         for (; i < (len-32); i+=32) {
 230             // Load ... 2*[int8*16]
 231             __m128i in8_1, in8_2;
 232             if (!aligned) {
 233                 in8_1 = _mm_loadu_si128((__m128i_u*)&buf[i]);
 234                 in8_2 = _mm_loadu_si128((__m128i_u*)&buf[i + 16]);
 235             } else {
 236                 in8_1 = _mm_load_si128((__m128i_u*)&buf[i]);
 237                 in8_2 = _mm_load_si128((__m128i_u*)&buf[i + 16]);
 238             }
 239
 240             // (1*buf[i] + 1*buf[i+1]), (1*buf[i+2], 1*buf[i+3]), ... 2*[int16*8]
 241             // Fastest, even though multiply by 1
 242             __m128i mul_one = _mm_set1_epi8(1);
 243             __m128i add16_1 = SSE2_MADDUBS_EPI16(mul_one, in8_1);
 244             __m128i add16_2 = SSE2_MADDUBS_EPI16(mul_one, in8_2);
 245
 246             // (4*buf[i] + 3*buf[i+1]), (2*buf[i+2], buf[i+3]), ... 2*[int16*8]
 247             __m128i mul_const = _mm_set1_epi32(4 + (3 << 8) + (2 << 16) + (1 << 24));
 248             \f__m128i mul_add16_1 = SSE2_MADDUBS_EPI16(mul_const, in8_1);
 249             __m128i mul_add16_2 = SSE2_MADDUBS_EPI16(mul_const, in8_2);
 250
 251             // s2 += 32*s1
 252             ss2 = _mm_add_epi32(ss2, _mm_slli_epi32(ss1, 5));
 253
 254             // [sum(t1[0]..t1[7]), X, X, X] [int32*4]; faster than multiple _mm_hadds_epi16
 255             // Shifting left, then shifting right again and shuffling (rather than just
 256             // shifting right as with mul32 below) to cheaply end up with the correct sign
 257             // extension as we go from int16 to int32.
 258             __m128i sum_add32 = _mm_add_epi16(add16_1, add16_2);
 259             sum_add32 = _mm_add_epi16(sum_add32, _mm_slli_si128(sum_add32, 2));
 260             sum_add32 = _mm_add_epi16(sum_add32, _mm_slli_si128(sum_add32, 4));
 261             sum_add32 = _mm_add_epi16(sum_add32, _mm_slli_si128(sum_add32, 8));
 262             sum_add32 = _mm_srai_epi32(sum_add32, 16);
 263             sum_add32 = _mm_shuffle_epi32(sum_add32, 3);
 264
 265             // [sum(t2[0]..t2[7]), X, X, X] [int32*4]; faster than multiple _mm_hadds_epi16
 266             __m128i sum_mul_add32 = _mm_add_epi16(mul_add16_1, mul_add16_2);
 267             sum_mul_add32 = _mm_add_epi16(sum_mul_add32, _mm_slli_si128(sum_mul_add32, 2));
 268             sum_mul_add32 = _mm_add_epi16(sum_mul_add32, _mm_slli_si128(sum_mul_add32, 4));
 269             sum_mul_add32 = _mm_add_epi16(sum_mul_add32, _mm_slli_si128(sum_mul_add32, 8));
 270             sum_mul_add32 = _mm_srai_epi32(sum_mul_add32, 16);
 271             sum_mul_add32 = _mm_shuffle_epi32(sum_mul_add32, 3);
 272
 273             // s1 += t1[0] + t1[1] + t1[2] + t1[3] + t1[4] + t1[5] + t1[6] + t1[7]
 274             ss1 = _mm_add_epi32(ss1, sum_add32);
 275
 276             // s2 += t2[0] + t2[1] + t2[2] + t2[3] + t2[4] + t2[5] + t2[6] + t2[7]
 277             ss2 = _mm_add_epi32(ss2, sum_mul_add32);
 278
 279             // [t1[0] + t1[1], t1[2] + t1[3] ...] [int16*8]
 280             // We could've combined this with generating sum_add32 above and
 281             // save an instruction but benchmarking shows that as being slower
 282             __m128i add16 = SSE2_HADDS_EPI16(add16_1, add16_2);
 283
 284             // [t1[0], t1[1], ...] -> [t1[0]*28 + t1[1]*24, ...] [int32*4]
 285             __m128i mul32 = _mm_madd_epi16(add16, mul_t1);
 286
 287             // [sum(mul32), X, X, X] [int32*4]; faster than multiple _mm_hadd_epi32
 288             mul32 = _mm_add_epi32(mul32, _mm_srli_si128(mul32, 4));
 289             mul32 = _mm_add_epi32(mul32, _mm_srli_si128(mul32, 8));
 290
 291             // s2 += 28*t1[0] + 24*t1[1] + 20*t1[2] + 16*t1[3] + 12*t1[4] + 8*t1[5] + 4*t1[6]
 292             ss2 = _mm_add_epi32(ss2, mul32);
 293
 294 #if CHAR_OFFSET != 0
 295             // s1 += 32*CHAR_OFFSET
 296             __m128i char_offset_multiplier = _mm_set1_epi32(32 * CHAR_OFFSET);
 297             ss1 = _mm_add_epi32(ss1, char_offset_multiplier);
 298
 299             // s2 += 528*CHAR_OFFSET
 300             char_offset_multiplier = _mm_set1_epi32(528 * CHAR_OFFSET);
 301             ss2 = _mm_add_epi32(ss2, char_offset_multiplier);
 302 #endif
 303         }
 304
 305         _mm_store_si128((__m128i_u*)x, ss1);
 306         *ps1 = x[0];
 307         _mm_store_si128((__m128i_u*)x, ss2);
 308         *ps2 = x[0];
 309     }
 310     return i;
 311 }
 312
 313 extern "C" __attribute__ ((target("avx2"))) int32 get_checksum1_avx2(schar* buf, int32 len, int32 i, uint32* ps1, uint32* ps2);
 314
 315 static int32 get_checksum1_default_1(schar* buf, int32 len, int32 i, uint32* ps1, uint32* ps2)
 316 {
 317     uint32 s1 = *ps1;
 318     uint32 s2 = *ps2;
 319     for (; i < (len-4); i+=4) {
 320         s2 += 4*(s1 + buf[i]) + 3*buf[i+1] + 2*buf[i+2] + buf[i+3] + 10*CHAR_OFFSET;
 321         s1 += (buf[i+0] + buf[i+1] + buf[i+2] + buf[i+3] + 4*CHAR_OFFSET);
 322     }
 323     for (; i < len; i++) {
 324         s1 += (buf[i]+CHAR_OFFSET); s2 += s1;
 325     }
 326     *ps1 = s1;
 327     *ps2 = s2;
 328     return i;
 329 }
 330
 331 /* With GCC 10 putting this implementation inside 'extern "C"' causes an
 332    assembler error. That worked fine on GCC 5-9 and clang 6-10...
 333   */
 334 static inline uint32 get_checksum1_cpp(char *buf1, int32 len)
 335 {
 336     int32 i = 0;
 337     uint32 s1 = 0;
 338     uint32 s2 = 0;
 339
 340     // multiples of 64 bytes using AVX2 (if available)
 341     i = get_checksum1_avx2((schar*)buf1, len, i, &s1, &s2);
 342
 343     // multiples of 32 bytes using SSSE3 (if available)
 344     i = get_checksum1_ssse3_32((schar*)buf1, len, i, &s1, &s2);
 345
 346     // multiples of 32 bytes using SSE2 (if available)
 347     i = get_checksum1_sse2_32((schar*)buf1, len, i, &s1, &s2);
 348
 349     // whatever is left
 350     i = get_checksum1_default_1((schar*)buf1, len, i, &s1, &s2);
 351
 352     return (s1 & 0xffff) + (s2 << 16);
 353 }
 354
 355 extern "C" {
 356
 357 uint32 get_checksum1(char *buf1, int32 len)
 358 {
 359     return get_checksum1_cpp(buf1, len);
 360 }
 361
 362 } // extern "C"
 363
 364 #ifdef BENCHMARK_SIMD_CHECKSUM1
 365 #pragma clang optimize off
 366 #pragma GCC push_options
 367 #pragma GCC optimize ("O0")
 368
 369 #define ROUNDS 1024
 370 #define BLOCK_LEN 1024*1024
 371
 372 #ifndef CLOCK_MONOTONIC_RAW
 373 #define CLOCK_MONOTONIC_RAW CLOCK_MONOTONIC
 374 #endif
 375
 376 static void benchmark(const char* desc, int32 (*func)(schar* buf, int32 len, int32 i, uint32* ps1, uint32* ps2), schar* buf, int32 len) {
 377     struct timespec start, end;
 378     uint64_t us;
 379     uint32_t cs, s1, s2;
 380     int i, next;
 381
 382     clock_gettime(CLOCK_MONOTONIC_RAW, &start);
 383     for (i = 0; i < ROUNDS; i++) {
 384         s1 = s2 = 0;
 385         next = func((schar*)buf, len, 0, &s1, &s2);
 386         get_checksum1_default_1((schar*)buf, len, next, &s1, &s2);
 387     }
 388     clock_gettime(CLOCK_MONOTONIC_RAW, &end);
 389     us = next == 0 ? 0 : (end.tv_sec - start.tv_sec) * 1000000 + (end.tv_nsec - start.tv_nsec) / 1000;
 390     cs = next == 0 ? 0 : (s1 & 0xffff) + (s2 << 16);
 391     printf("%-5s :: %5.0f MB/s :: %08x\n", desc, us ? (float)(len / (1024 * 1024) * ROUNDS) / ((float)us / 1000000.0f) : 0, cs);
 392 }
 393
 394 static int32 get_checksum1_auto(schar* buf, int32 len, int32 i, uint32* ps1, uint32* ps2) {
 395     uint32 cs = get_checksum1((char*)buf, len);
 396     *ps1 = cs & 0xffff;
 397     *ps2 = cs >> 16;
 398     return len;
 399 }
 400
 401 int main() {
 402     int i;
 403     unsigned char* buf = (unsigned char*)aligned_alloc(64,BLOCK_LEN);
 404     for (i = 0; i < BLOCK_LEN; i++) buf[i] = (i + (i % 3) + (i % 11)) % 256;
 405
 406     benchmark("Auto", get_checksum1_auto, (schar*)buf, BLOCK_LEN);
 407     benchmark("Raw-C", get_checksum1_default_1, (schar*)buf, BLOCK_LEN);
 408     benchmark("SSE2", get_checksum1_sse2_32, (schar*)buf, BLOCK_LEN);
 409     benchmark("SSSE3", get_checksum1_ssse3_32, (schar*)buf, BLOCK_LEN);
 410     benchmark("AVX2", get_checksum1_avx2, (schar*)buf, BLOCK_LEN);
 411
 412     free(buf);
 413     return 0;
 414 }
 415
 416 #pragma GCC pop_options
 417 #pragma clang optimize on
 418 #endif /* BENCHMARK_SIMD_CHECKSUM1 */
 419
 420 #endif /* HAVE_SIMD */
 421 #endif /* __cplusplus */
 422 #endif /* __x86_64__ */