RetroSearch Browse

Home - News ( United States | United Kingdom | Italy | Germany ) - Football scores

Showing content from https://github.com/Genivia/ugrep/issues/421 below:

[FR] Faster binary file checking with "ridiculously fast" SIMD UTF-8 validation · Issue #421 · Genivia/ugrep · GitHub

Ugrep checks if an input file is binary by checking if it has a zero byte or if it contains invalid UTF-8. Binary file checking is important, because the -I (--ignore-binary) uses it and also every search that has a match reports "Binary file XYZ matches".

To speed this up, "Ridiculously fast unicode (UTF-8) validation" comes to mind (sorry, not my choice of title), with details in this article. It uses table lookups in SSE3/SSE4.1/AVX and NEON/AArch64.

However, my dilemma is that I can't rely on the availability of SSE3, since the base ugrep version uses SSE2 for Intel CPUs, i.e. not SSE3 (or SSSE3 or higher) nor do I want to suddenly require SSE3 when it ran fast on SSE2 or have additional configure checks just for SSE3 to enable faster binary file checking (i.e. UTF-8 validation checking).

So I recently wrote a faster SIMD UTF-8 validation check that uses "bit bashing". It works with SSE2 just fine and runs fast too. Compared to the "ridiculously fast" article's table lookup method, my proposed method runs in SSE2 (table lookup does not, it requires SSE3). Also very nice is that the new method is faster than table lookup method for SSE2, SSE3 and AVX2:

UPDATE: updated with faster method that merges two tests into one to branch out of the loop.

SIMD algorithm time throughput table lookup method SSE4.1 compiled with -mavx2 -O2 174ms 5.7GB/s table lookup method SSE3 compiled with -msse3 -O2 205ms 4.9GB/s bitbash method SSE4.1 version compiled with -mavx2 -O2 161ms 6.2GB/s bitbash method SSE3 version compiled with -msse3 -O2 195ms 5.1GB/s bitbash method SSE2 version compiled with -msse2 -O2 204ms 4.9GB/s bitbash method AVX2 version compiled with -mavx2 -O2 (see next post) 117ms 8.5GB/s

Benchmark machine: MacOS 12.7.4 Intel quad core i7 2.9 GHz 16GB 2133 MHz LPDDR3
Benchmark file: enwik9 file size 1,000,000,000 mostly ASCII with some UTF-8

While both the table lookup and my proposed "bitbash" method both validate UTF-8, there is a small difference however. "Bitbash" does not flag surrogates and accepts some overlongs in the longer 3 and 4 byte UTF-8 sequences. That is fine for quickly validating UTF-8 to check if the input is binary or text. Surrogates and overlongs are practically never present in text files. The ugrep tool does not choke on them anyway.

The SSE2 implementation with commentary (note that we can't use palignr with SSE2 nor can we use _mm_shuffle_epi8 that the table lookup method uses):

bool isutf8(const char *s, const char *e)
{
  if (s <= e - 16)
  {
    const __m128i vxc0 = _mm_set1_epi8(0xc0);
    const __m128i vxc1 = _mm_set1_epi8(0xc1);
    const __m128i vxf5 = _mm_set1_epi8(0xf5);
    const __m128i v0 = _mm_setzero_si128();
    __m128i vp = v0;
    __m128i vq = v0;
    __m128i vr = v0;
    while (s <= e - 16)
    {
      // step 1: check valid signed byte ranges
      //   c = s[i]
      //   if (!(c > 0 || c < -64 || (c > -63 && c < -11)))
      //     return false
      //
      __m128i vc = _mm_loadu_si128(reinterpret_cast<const __m128i*>(s));
      __m128i vt = _mm_and_si128(_mm_cmpgt_epi8(vc, vxc1), _mm_cmplt_epi8(vc, vxf5));
      vt = _mm_or_si128(vt, _mm_cmplt_epi8(vc, vxc0));
      vt = _mm_or_si128(vt, _mm_cmpgt_epi8(vc, v0));
      __m128i vm = vt;
      //   step 2: check UTF-8 multi-byte sequences of 2, 3 and 4 bytes long
      //     if (((-(c > -63) ^ (p | q | r)) & 0x80) != 0x80)
      //       return false
      //     r = (q & (q << 1))
      //     q = (p & (p << 1))
      //     p = (c & (c << 1))
      //
      //   possible values of c after step 1 and subsequent values of p, q, r:
      //     c at 1st byte   p at 2nd byte   q at 3rd byte   r at 4th byte
      //       0xxxxxxx        0xxxxxxx        0xxxxxxx        0xxxxxxx
      //       10xxxxxx        00xxxxxx        00xxxxxx        00xxxxxx
      //       110xxxxx        100xxxxx        000xxxxx        000xxxxx
      //       1110xxxx        1100xxxx        1000xxxx        0000xxxx
      //       11110xxx        11100xxx        11000xxx        10000xxx
      //
      //   byte vectors vc, vp, vq, vr and previous values:
      //                             | c | c | c | c | ... | c |
      //                     | old p | p | p | p | p | ... | p |
      //             | old q | old q | q | q | q | q | ... | q |
      //     | old r | old r | old r | r | r | r | r | ... | r |
      //
      //   shift vectors vp, vq, vr to align to compute bitwise-or vp | vq | vr -> vt:
      //                 |     c |     c |     c | c | ... | c | = vc
      //                 | old p |     p |     p | p | ... | p |
      //                 | old q | old q |     q | q | ... | q |
      //                 | old r | old r | old r | r | ... | r |
      //                   -----   -----   -----   -   ---   -   or
      //                 |     t |     t |     t | t | ... | t | = vt
      //
      //   SSE2 code to perform r = (q & (q << 1)); q = (p & (p << 1)); p = (c & (c << 1));
      //   shift parts of the old vp, vq, vr and new vp, vq, vr in vt using psrldq and por
      //   then check if ((-(c > -63) ^ (p | q | r))) bit 7 is 1
      //
      vt = _mm_bsrli_si128(vp, 15);
      vp = _mm_and_si128(vc, _mm_add_epi8(vc, vc));
      vt = _mm_or_si128(vt, _mm_bsrli_si128(vq, 14));
      vq = _mm_and_si128(vp, _mm_add_epi8(vp, vp));
      vt = _mm_or_si128(vt, _mm_bsrli_si128(vr, 13));
      vr = _mm_and_si128(vq, _mm_add_epi8(vq, vq));
      vt = _mm_or_si128(vt, _mm_bslli_si128(vp, 1));
      vt = _mm_or_si128(vt, _mm_bslli_si128(vq, 2));
      vt = _mm_or_si128(vt, _mm_bslli_si128(vr, 3));
      vt = _mm_xor_si128(vt, _mm_cmpgt_epi8(vc, vxc1));
      vm = _mm_and_si128(vm, vt);
      if (_mm_movemask_epi8(vm) != 0xffff)
        return false;
      s += 16;
    }
    // do not end in the middle of a UTF-8 multibyte sequence, backtrack when necessary (this will terminate)
    while ((*--s & 0xc0) == 0x80)
      continue;
  }
  // check remainder with scalar code

With SSE3/SSE4/AVX we can use _mm_alignr_epi8 and no longer need the _mm_bsrli_si128 and _mm_bslli_si128:

    [...]
    const __m128i vxc0 = _mm_set1_epi8(0xc0);
    const __m128i vxc1 = _mm_set1_epi8(0xc1);
    const __m128i vxf5 = _mm_set1_epi8(0xf5);
    const __m128i v0 = _mm_setzero_si128();
    __m128i vp = v0;
    __m128i vq = v0;
    __m128i vr = v0;
    while (s <= e - 16)
    {
      __m128i vc = _mm_loadu_si128(reinterpret_cast<const __m128i*>(s));
      __m128i vt = _mm_and_si128(_mm_cmpgt_epi8(vc, vxc1), _mm_cmplt_epi8(vc, vxf5));
      vt = _mm_or_si128(vt, _mm_cmplt_epi8(vc, vxc0));
      vt = _mm_or_si128(vt, _mm_cmpgt_epi8(vc, v0));
      __m128i vm = vt;
      __m128i vo = vp;
      vp = _mm_and_si128(vc, _mm_add_epi8(vc, vc));
      vt = _mm_alignr_epi8(vp, vo, 15);
      vo = vq;
      vq = _mm_and_si128(vp, _mm_add_epi8(vp, vp));
      vt = _mm_or_si128(vt, _mm_alignr_epi8(vq, vo, 14));
      vo = vr;
      vr = _mm_and_si128(vq, _mm_add_epi8(vq, vq));
      vt = _mm_or_si128(vt, _mm_alignr_epi8(vr, vo, 13));
      vt = _mm_xor_si128(vt, _mm_cmpgt_epi8(vc, vxc1));
      vm = _mm_and_si128(vm, vt);
      if (_mm_movemask_epi8(vm) != 0xffff)
        return false;
      s += 16;
    }
    while ((*--s & 0xc0) == 0x80)
      continue;

The "bitbash" UTF-8 validation in ARM NEON, tested with Apple M1 Pro (AArch64):

SIMD algorithm time throughput bitbash method ARM NEON version compiled with -O2 116ms 8.6GB/s

  if (s <= e - 16)
  {
    const int8x16_t vxc0 = vdupq_n_s8(0xc0);
    const int8x16_t vxc1 = vdupq_n_s8(0xc1);
    const int8x16_t vxf5 = vdupq_n_s8(0xf5);
    const int8x16_t v0 = vdupq_n_s8(0);
    int8x16_t vp = v0;
    int8x16_t vq = v0;
    int8x16_t vr = v0;
    while (s <= e - 16)
    {
      int8x16_t vc = vld1q_s8(reinterpret_cast<const int8_t*>(s));
      int8x16_t vt = vandq_s8(vcgtq_s8(vc, vxc1), vcltq_s8(vc, vxf5));
      vt = vorrq_s8(vt, vcltq_s8(vc, vxc0));
      vt = vorrq_s8(vt, vcgtq_s8(vc, v0));
      int64x2_t vm = vreinterpretq_s64_s8(vt);
      int8x16_t vo = vp;
      vp = vandq_s8(vc, vshlq_n_s8(vc, 1));
      vt = vextq_s8(vo, vp, 15);
      vo = vq;
      vq = vandq_s8(vp, vshlq_n_s8(vp, 1));
      vt = vorrq_s8(vt, vextq_s8(vo, vq, 14));
      vo = vr;
      vr = vandq_s8(vq, vshlq_n_s8(vq, 1));
      vt = vorrq_s8(vt, vextq_s8(vo, vr, 13));
      vt = veorq_s8(vt, vcgtq_s8(vc, vxc1));
      vm = vandq_s64(vm, vreinterpretq_s64_s8(vt));
      if (((vgetq_lane_s64(vm, 0) & vgetq_lane_s64(vm, 1)) & 0x8080808080808080LL) != 0x8080808080808080LL)
        return false;;
      s += 16;
    }
    while ((*--s & 0xc0) == 0x80)
      continue;
  }

RetroSearch is an open source project built by @garambo | Open a GitHub Issue

Search and Browse the WWW like it's 1997 | Search results from DuckDuckGo

HTML: 3.2 | Encoding: UTF-8 | Version: 0.7.4