NCBI C++ ToolKit: include/util/bitset/bmavx2.h Source File

#ifndef BMAVX2__H__INCLUDED__
#define BMAVX2__H__INCLUDED__

void avx2_print256_u32(const char* prefix, const __m256i& value)
{
    const size_t n = sizeof(__m256i) / sizeof(unsigned);
    std::cout << prefix << " [ ";
    for (int i = n-1; 1; --i)
        std::cout << buffer[i] << " ";
    std::cout << "]" << std::endl;
}

void avx2_print256_u16(const char* prefix, const __m256i& value)
{
    const size_t n = sizeof(__m256i) / sizeof(unsigned short);
    std::cout << prefix << " [ ";
    for (int i = n-1; 1; --i)
        std::cout << buffer[i] << " ";
    std::cout << "]" << std::endl;
}

#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wconversion"

#define BM_CSA256(h, l, a, b, c) \
{ \
    __m256i u = _mm256_xor_si256(a, b); \
    h = _mm256_or_si256(_mm256_and_si256(a, b), _mm256_and_si256(u, c)); \
    l = _mm256_xor_si256(u, c); \
}

#define BM_AVX2_BIT_COUNT(ret, v) \
{ \
    __m256i lo = _mm256_and_si256(v, low_mask); \
    __m256i hi = _mm256_and_si256(_mm256_srli_epi16(v, 4), low_mask); \
    __m256i cnt1 = _mm256_shuffle_epi8(lookup1, lo); \
    __m256i cnt2 = _mm256_shuffle_epi8(lookup2, hi); \
    ret = _mm256_sad_epu8(cnt1, cnt2); \
}

#define BM_AVX2_DECL_LOOKUP1 \
  __m256i lookup1 = _mm256_setr_epi8(4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8, \
                                     4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8);
#define BM_AVX2_DECL_LOOKUP2 \
  __m256i lookup2 = _mm256_setr_epi8(4, 3, 3, 2, 3, 2, 2, 1, 3, 2, 2, 1, 2, 1, 1, 0, \
                                     4, 3, 3, 2, 3, 2, 2, 1, 3, 2, 2, 1, 2, 1, 1, 0);

#define BM_AVX2_POPCNT_PROLOG \
  BM_AVX2_DECL_LOOKUP1 \
  BM_AVX2_DECL_LOOKUP2 \
  __m256i low_mask = _mm256_set1_epi8(0x0f); \

    __m256i cnt = _mm256_setzero_si256();
    __m256i ones = _mm256_setzero_si256();
    __m256i twos = _mm256_setzero_si256();
    __m256i fours = _mm256_setzero_si256();
    __m256i eights = _mm256_setzero_si256();
    __m256i sixteens = _mm256_setzero_si256();
    __m256i twosA, twosB, foursA, foursB, eightsA, eightsB;

    b = _mm256_load_si256(block+0); c = _mm256_load_si256(block+1);
    b = _mm256_load_si256(block+2); c = _mm256_load_si256(block+3);
    BM_CSA256(foursA, twos, twos, twosA, twosB);
    b = _mm256_load_si256(block+4); c = _mm256_load_si256(block+5);
    b = _mm256_load_si256(block+6); c = _mm256_load_si256(block+7);
    BM_CSA256(foursB, twos, twos, twosA, twosB);
    BM_CSA256(eightsA, fours, fours, foursA, foursB);
    b = _mm256_load_si256(block+8); c = _mm256_load_si256(block+9);
    b = _mm256_load_si256(block+10); c = _mm256_load_si256(block+11);
    BM_CSA256(foursA, twos, twos, twosA, twosB);
    b = _mm256_load_si256(block+12); c = _mm256_load_si256(block+13);
    b = _mm256_load_si256(block+14); c = _mm256_load_si256(block+15);
    BM_CSA256(foursB, twos, twos, twosA, twosB);
    BM_CSA256(eightsB, fours, fours, foursA, foursB);
    BM_CSA256(sixteens, eights, eights, eightsA, eightsB);
    cnt = _mm256_add_epi64(cnt, bc);
    } while (block < block_end);

    cnt = _mm256_slli_epi64(cnt, 4);
    cnt = _mm256_add_epi64(cnt, _mm256_slli_epi64(bc, 3));
    cnt = _mm256_add_epi64(cnt, _mm256_slli_epi64(bc, 2));
    cnt = _mm256_add_epi64(cnt, _mm256_slli_epi64(bc, 1));
    cnt = _mm256_add_epi64(cnt, bc);
    return (unsigned)(cnt64[0] + cnt64[1] + cnt64[2] + cnt64[3]);
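// Sketch (not part of bmavx2.h): the two ideas used above, in their common single-lookup
// form. BM_AVX2_BIT_COUNT uses two lookups offset by +4/-4 so that _mm256_sad_epu8(cnt1, cnt2)
// yields pc(lo)+pc(hi) per byte; BM_CSA256 is a bit-sliced full adder (Harley-Seal counting).
// Helper names below are illustrative, not the library's API.
#include <immintrin.h>
#include <cstdint>

static inline __m256i popcount256(__m256i v)            // 4 x 64-bit partial popcounts
{
    const __m256i lookup = _mm256_setr_epi8(
        0,1,1,2, 1,2,2,3, 1,2,2,3, 2,3,3,4,
        0,1,1,2, 1,2,2,3, 1,2,2,3, 2,3,3,4);
    const __m256i low_mask = _mm256_set1_epi8(0x0f);
    __m256i lo  = _mm256_and_si256(v, low_mask);
    __m256i hi  = _mm256_and_si256(_mm256_srli_epi16(v, 4), low_mask);
    __m256i cnt = _mm256_add_epi8(_mm256_shuffle_epi8(lookup, lo),
                                  _mm256_shuffle_epi8(lookup, hi));
    return _mm256_sad_epu8(cnt, _mm256_setzero_si256()); // sum bytes per 64-bit lane
}

static inline void csa256(__m256i& h, __m256i& l, __m256i a, __m256i b, __m256i c)
{
    __m256i u = _mm256_xor_si256(a, b);                  // (h,l) = a + b + c, bit-sliced
    h = _mm256_or_si256(_mm256_and_si256(a, b), _mm256_and_si256(u, c));
    l = _mm256_xor_si256(u, c);
}

inline std::uint64_t popcount_block(const __m256i* p, unsigned n)  // n must be even here
{
    __m256i total = _mm256_setzero_si256(), ones = _mm256_setzero_si256(), twos;
    for (unsigned i = 0; i < n; i += 2)
    {
        csa256(twos, ones, ones, _mm256_loadu_si256(p+i), _mm256_loadu_si256(p+i+1));
        total = _mm256_add_epi64(total, popcount256(twos));
    }
    total = _mm256_slli_epi64(total, 1);                 // each "two" weighs 2 bits
    total = _mm256_add_epi64(total, popcount256(ones));
    std::uint64_t t[4];
    _mm256_storeu_si256((__m256i*)t, total);
    return t[0] + t[1] + t[2] + t[3];
}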

    __m256i cnt = _mm256_setzero_si256();
    const __m256i* BMRESTRICT wave_src = (__m256i*)&block[off];

    __m256i m1A, m1B, m1C, m1D;
    m1A = _mm256_load_si256(wave_src);
    m1B = _mm256_load_si256(wave_src+1);
    if (!_mm256_testz_si256(m1A, m1A))
        cnt = _mm256_add_epi64(cnt, bc);
    if (!_mm256_testz_si256(m1B, m1B))
        cnt = _mm256_add_epi64(cnt, bc);

    m1C = _mm256_load_si256(wave_src+2);
    m1D = _mm256_load_si256(wave_src+3);
    if (!_mm256_testz_si256(m1C, m1C))
        cnt = _mm256_add_epi64(cnt, bc);
    if (!_mm256_testz_si256(m1D, m1D))
        cnt = _mm256_add_epi64(cnt, bc);

    count = (unsigned)(cnt64[0] + cnt64[1] + cnt64[2] + cnt64[3]);

    __m256i cnt = _mm256_setzero_si256();

    ymm0 = _mm256_load_si256(block);
    ymm1 = _mm256_load_si256(mask_block);
    ymm0 = _mm256_and_si256(ymm0, ymm1);
    ++block; ++mask_block;
    cnt = _mm256_add_epi64(cnt, bc);

    ymm0 = _mm256_load_si256(block);
    ymm1 = _mm256_load_si256(mask_block);
    ymm0 = _mm256_and_si256(ymm0, ymm1);
    ++block; ++mask_block;
    cnt = _mm256_add_epi64(cnt, bc);

    ymm0 = _mm256_load_si256(block);
    ymm1 = _mm256_load_si256(mask_block);
    ymm0 = _mm256_and_si256(ymm0, ymm1);
    ++block; ++mask_block;
    cnt = _mm256_add_epi64(cnt, bc);

    ymm0 = _mm256_load_si256(block);
    ymm1 = _mm256_load_si256(mask_block);
    ymm0 = _mm256_and_si256(ymm0, ymm1);
    ++block; ++mask_block;
    cnt = _mm256_add_epi64(cnt, bc);

    } while (block < block_end);
    return (unsigned)(cnt64[0] + cnt64[1] + cnt64[2] + cnt64[3]);

    __m256i cnt = _mm256_setzero_si256();

    __m256i tmp0 = _mm256_load_si256(block);
    __m256i tmp1 = _mm256_load_si256(mask_block);
    cnt = _mm256_add_epi64(cnt, bc);
    ++block; ++mask_block;
    } while (block < block_end);
    return (unsigned)(cnt64[0] + cnt64[1] + cnt64[2] + cnt64[3]);

    __m256i cnt = _mm256_setzero_si256();
    __m256i mA, mB, mC, mD;

    mA = _mm256_xor_si256(_mm256_load_si256(block+0), _mm256_load_si256(mask_block+0));
    cnt = _mm256_add_epi64(cnt, bc);
    mB = _mm256_xor_si256(_mm256_load_si256(block+1), _mm256_load_si256(mask_block+1));
    cnt = _mm256_add_epi64(cnt, bc);
    mC = _mm256_xor_si256(_mm256_load_si256(block+2), _mm256_load_si256(mask_block+2));
    cnt = _mm256_add_epi64(cnt, bc);
    mD = _mm256_xor_si256(_mm256_load_si256(block+3), _mm256_load_si256(mask_block+3));
    cnt = _mm256_add_epi64(cnt, bc);

    block += 4; mask_block += 4;
    } while (block < block_end);
    return (unsigned)(cnt64[0] + cnt64[1] + cnt64[2] + cnt64[3]);

    __m256i cnt = _mm256_setzero_si256();

    __m256i tmp0 = _mm256_load_si256(block);
    __m256i tmp1 = _mm256_load_si256(mask_block);
    cnt = _mm256_add_epi64(cnt, bc);
    ++block; ++mask_block;
    } while (block < block_end);
    return (unsigned)(cnt64[0] + cnt64[1] + cnt64[2] + cnt64[3]);

    __m256i yM = _mm256_set1_epi32(int(mask));

    _mm256_store_si256(dst+0, _mm256_xor_si256(_mm256_load_si256(src+0), yM));
    _mm256_store_si256(dst+1, _mm256_xor_si256(_mm256_load_si256(src+1), yM));
    _mm256_store_si256(dst+2, _mm256_xor_si256(_mm256_load_si256(src+2), yM));
    _mm256_store_si256(dst+3, _mm256_xor_si256(_mm256_load_si256(src+3), yM));
    } while (src < src_end);

    __m256i yM = _mm256_set1_epi32(int(mask));

    _mm256_store_si256(dst+0, _mm256_andnot_si256(_mm256_load_si256(src+0), yM));
    _mm256_store_si256(dst+1, _mm256_andnot_si256(_mm256_load_si256(src+1), yM));
    _mm256_store_si256(dst+2, _mm256_andnot_si256(_mm256_load_si256(src+2), yM));
    _mm256_store_si256(dst+3, _mm256_andnot_si256(_mm256_load_si256(src+3), yM));
    } while (src < src_end);

    __m256i m1A, m1B, m1C, m1D;
    __m256i accA, accB, accC, accD;

    accA = accB = accC = accD = _mm256_setzero_si256();

    m1A = _mm256_and_si256(_mm256_load_si256(src+0), _mm256_load_si256(dst+0));
    m1B = _mm256_and_si256(_mm256_load_si256(src+1), _mm256_load_si256(dst+1));
    m1C = _mm256_and_si256(_mm256_load_si256(src+2), _mm256_load_si256(dst+2));
    m1D = _mm256_and_si256(_mm256_load_si256(src+3), _mm256_load_si256(dst+3));

    _mm256_store_si256(dst+0, m1A);
    _mm256_store_si256(dst+1, m1B);
    _mm256_store_si256(dst+2, m1C);
    _mm256_store_si256(dst+3, m1D);

    accA = _mm256_or_si256(accA, m1A);
    accB = _mm256_or_si256(accB, m1B);
    accC = _mm256_or_si256(accC, m1C);
    accD = _mm256_or_si256(accD, m1D);
    } while (src < src_end);

    accA = _mm256_or_si256(accA, accB);
    accC = _mm256_or_si256(accC, accD);
    accA = _mm256_or_si256(accA, accC);
    return !_mm256_testz_si256(accA, accA);

    __m256i m1A, m1B, m1C, m1D;

    m1A = _mm256_and_si256(_mm256_load_si256(src+0), _mm256_load_si256(dst+0));
    m1B = _mm256_and_si256(_mm256_load_si256(src+1), _mm256_load_si256(dst+1));
    m1C = _mm256_and_si256(_mm256_load_si256(src+2), _mm256_load_si256(dst+2));
    m1D = _mm256_and_si256(_mm256_load_si256(src+3), _mm256_load_si256(dst+3));

    _mm256_store_si256(dst+0, m1A);
    _mm256_store_si256(dst+1, m1B);
    _mm256_store_si256(dst+2, m1C);
    _mm256_store_si256(dst+3, m1D);

    m1A = _mm256_or_si256(m1A, m1B);
    m1C = _mm256_or_si256(m1C, m1D);
    m1A = _mm256_or_si256(m1A, m1C);
    return _mm256_testz_si256(m1A, m1A);

    __m256i m1A, m1B, m1C, m1D;

    m1A = _mm256_and_si256(_mm256_load_si256(src1+0), _mm256_load_si256(src2+0));
    m1B = _mm256_and_si256(_mm256_load_si256(src1+1), _mm256_load_si256(src2+1));
    m1C = _mm256_and_si256(_mm256_load_si256(src1+2), _mm256_load_si256(src2+2));
    m1D = _mm256_and_si256(_mm256_load_si256(src1+3), _mm256_load_si256(src2+3));

    _mm256_store_si256(dst+0, m1A);
    _mm256_store_si256(dst+1, m1B);
    _mm256_store_si256(dst+2, m1C);
    _mm256_store_si256(dst+3, m1D);

    m1A = _mm256_or_si256(m1A, m1B);
    m1C = _mm256_or_si256(m1C, m1D);
    m1A = _mm256_or_si256(m1A, m1C);
    return _mm256_testz_si256(m1A, m1A);
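// Sketch (not part of bmavx2.h): the digest AND step seen above, as a standalone helper.
// It ANDs one 4 x 256-bit stripe of two blocks into dst and reports whether the stripe
// became all zero via _mm256_testz_si256. Name is illustrative; 32-byte alignment assumed.
#include <immintrin.h>

inline bool and_digest_2way_sketch(__m256i* dst, const __m256i* src1, const __m256i* src2)
{
    __m256i mA = _mm256_and_si256(_mm256_load_si256(src1+0), _mm256_load_si256(src2+0));
    __m256i mB = _mm256_and_si256(_mm256_load_si256(src1+1), _mm256_load_si256(src2+1));
    __m256i mC = _mm256_and_si256(_mm256_load_si256(src1+2), _mm256_load_si256(src2+2));
    __m256i mD = _mm256_and_si256(_mm256_load_si256(src1+3), _mm256_load_si256(src2+3));

    _mm256_store_si256(dst+0, mA);
    _mm256_store_si256(dst+1, mB);
    _mm256_store_si256(dst+2, mC);
    _mm256_store_si256(dst+3, mD);

    __m256i acc = _mm256_or_si256(_mm256_or_si256(mA, mB), _mm256_or_si256(mC, mD));
    return _mm256_testz_si256(acc, acc);   // true when every result bit is zero
}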

    const __m256i maskF = _mm256_set1_epi32(~0u);
    __m256i m1A, m1B, m1C, m1D;
    __m256i mSA, mSB, mSC, mSD;

    mSA = _mm256_load_si256(dst+0);
    mSB = _mm256_load_si256(dst+1);
    mACC1 = _mm256_and_si256(mSA, mSB);
    mSC = _mm256_load_si256(dst+2);
    mSD = _mm256_load_si256(dst+3);
    mACC1 = _mm256_and_si256(mACC1, _mm256_and_si256(mSC, mSD));
    mACC1 = _mm256_xor_si256(mACC1, maskF);
    if (_mm256_testz_si256(mACC1, mACC1))

    m1A = _mm256_and_si256(_mm256_load_si256(src1+0), _mm256_load_si256(src2+0));
    m1B = _mm256_and_si256(_mm256_load_si256(src1+1), _mm256_load_si256(src2+1));
    m1C = _mm256_and_si256(_mm256_load_si256(src1+2), _mm256_load_si256(src2+2));
    m1D = _mm256_and_si256(_mm256_load_si256(src1+3), _mm256_load_si256(src2+3));

    _mm256_or_si256(_mm256_or_si256(m1A, m1B), _mm256_or_si256(m1C, m1D));
    bool all_z = _mm256_testz_si256(mACC1, mACC1);

    m1A = _mm256_or_si256(mSA, m1A);
    m1B = _mm256_or_si256(mSB, m1B);
    m1C = _mm256_or_si256(mSC, m1C);
    m1D = _mm256_or_si256(mSD, m1D);

    _mm256_store_si256(dst+0, m1A);
    _mm256_store_si256(dst+1, m1B);
    _mm256_store_si256(dst+2, m1C);
    _mm256_store_si256(dst+3, m1D);

    __m256i m1A, m1B, m1C, m1D;
    __m256i m1E, m1F, m1G, m1H;

    __m256i s1_0, s2_0, s1_1, s2_1;
    s1_0 = _mm256_load_si256(src1 + 0); s2_0 = _mm256_load_si256(src2 + 0);
    s1_1 = _mm256_load_si256(src1 + 1); s2_1 = _mm256_load_si256(src2 + 1);
    m1A = _mm256_and_si256(s1_0, s2_0);
    m1B = _mm256_and_si256(s1_1, s2_1);
    s1_0 = _mm256_load_si256(src1 + 2); s2_0 = _mm256_load_si256(src2 + 2);
    s1_1 = _mm256_load_si256(src1 + 3); s2_1 = _mm256_load_si256(src2 + 3);
    m1C = _mm256_and_si256(s1_0, s2_0);
    m1D = _mm256_and_si256(s1_1, s2_1);

    __m256i s3_0, s4_0, s3_1, s4_1;
    s3_0 = _mm256_load_si256(src3 + 0); s4_0 = _mm256_load_si256(src4 + 0);
    s3_1 = _mm256_load_si256(src3 + 1); s4_1 = _mm256_load_si256(src4 + 1);
    m1E = _mm256_and_si256(s3_0, s4_0);
    m1F = _mm256_and_si256(s3_1, s4_1);
    m1A = _mm256_and_si256(m1A, m1E);
    m1B = _mm256_and_si256(m1B, m1F);
    s3_0 = _mm256_load_si256(src3 + 2); s4_0 = _mm256_load_si256(src4 + 2);
    s3_1 = _mm256_load_si256(src3 + 3); s4_1 = _mm256_load_si256(src4 + 3);
    m1G = _mm256_and_si256(s3_0, s4_0);
    m1H = _mm256_and_si256(s3_1, s4_1);

    dst0 = _mm256_load_si256(dst + 0); dst1 = _mm256_load_si256(dst + 1);
    m1C = _mm256_and_si256(m1C, m1G);
    m1D = _mm256_and_si256(m1D, m1H);
    m1A = _mm256_and_si256(m1A, dst0);
    m1B = _mm256_and_si256(m1B, dst1);
    dst0 = _mm256_load_si256(dst + 2); dst1 = _mm256_load_si256(dst + 3);
    m1C = _mm256_and_si256(m1C, dst0);
    m1D = _mm256_and_si256(m1D, dst1);

    _mm256_store_si256(dst + 0, m1A);
    _mm256_store_si256(dst + 1, m1B);
    _mm256_store_si256(dst + 2, m1C);
    _mm256_store_si256(dst + 3, m1D);

    m1A = _mm256_or_si256(m1A, m1B);
    m1C = _mm256_or_si256(m1C, m1D);
    m1A = _mm256_or_si256(m1A, m1C);
    return _mm256_testz_si256(m1A, m1A);

    __m256i m1A, m1B, m1C, m1D;

    __m256i s1_0, s2_0, s1_1, s2_1;
    s1_0 = _mm256_load_si256(src1 + 0); s2_0 = _mm256_load_si256(src2 + 0);
    s1_1 = _mm256_load_si256(src1 + 1); s2_1 = _mm256_load_si256(src2 + 1);
    m1A = _mm256_and_si256(s1_0, s2_0);
    m1B = _mm256_and_si256(s1_1, s2_1);
    s1_0 = _mm256_load_si256(src1 + 2); s2_0 = _mm256_load_si256(src2 + 2);
    s1_1 = _mm256_load_si256(src1 + 3); s2_1 = _mm256_load_si256(src2 + 3);
    m1C = _mm256_and_si256(s1_0, s2_0);
    m1D = _mm256_and_si256(s1_1, s2_1);

    dst0 = _mm256_load_si256(dst + 0); dst1 = _mm256_load_si256(dst + 1);
    m1A = _mm256_and_si256(m1A, dst0);
    m1B = _mm256_and_si256(m1B, dst1);
    dst0 = _mm256_load_si256(dst + 2); dst1 = _mm256_load_si256(dst + 3);
    m1C = _mm256_and_si256(m1C, dst0);
    m1D = _mm256_and_si256(m1D, dst1);

    _mm256_store_si256(dst + 0, m1A);
    _mm256_store_si256(dst + 1, m1B);
    _mm256_store_si256(dst + 2, m1C);
    _mm256_store_si256(dst + 3, m1D);

    m1A = _mm256_or_si256(m1A, m1B);
    m1C = _mm256_or_si256(m1C, m1D);
    m1A = _mm256_or_si256(m1A, m1C);
    return _mm256_testz_si256(m1A, m1A);

    __m256i m1A, m2A, m1B, m2B, m1C, m2C, m1D, m2D;
    __m256i accA, accB, accC, accD;

    accA = _mm256_setzero_si256();
    accB = _mm256_setzero_si256();
    accC = _mm256_setzero_si256();
    accD = _mm256_setzero_si256();

    m1A = _mm256_loadu_si256(src+0);
    m2A = _mm256_load_si256(dst+0);
    m1A = _mm256_and_si256(m1A, m2A);
    _mm256_store_si256(dst+0, m1A);
    accA = _mm256_or_si256(accA, m1A);

    m1B = _mm256_loadu_si256(src+1);
    m2B = _mm256_load_si256(dst+1);
    m1B = _mm256_and_si256(m1B, m2B);
    _mm256_store_si256(dst+1, m1B);
    accB = _mm256_or_si256(accB, m1B);

    m1C = _mm256_loadu_si256(src+2);
    m2C = _mm256_load_si256(dst+2);
    m1C = _mm256_and_si256(m1C, m2C);
    _mm256_store_si256(dst+2, m1C);
    accC = _mm256_or_si256(accC, m1C);

    m1D = _mm256_loadu_si256(src+3);
    m2D = _mm256_load_si256(dst+3);
    m1D = _mm256_and_si256(m1D, m2D);
    _mm256_store_si256(dst+3, m1D);
    accD = _mm256_or_si256(accD, m1D);

    } while (src < src_end);

    accA = _mm256_or_si256(accA, accB);
    accC = _mm256_or_si256(accC, accD);
    accA = _mm256_or_si256(accA, accC);
    return !_mm256_testz_si256(accA, accA);

    __m256i m1A, m1B, m1C, m1D;
    __m256i mAccF0 = _mm256_set1_epi32(~0u);
    __m256i mAccF1 = _mm256_set1_epi32(~0u);

    m1A = _mm256_or_si256(_mm256_load_si256(src), _mm256_load_si256(dst));
    m1B = _mm256_or_si256(_mm256_load_si256(src+1), _mm256_load_si256(dst+1));
    mAccF0 = _mm256_and_si256(mAccF0, m1A);
    mAccF0 = _mm256_and_si256(mAccF0, m1B);

    _mm256_stream_si256(dst, m1A);
    _mm256_stream_si256(dst+1, m1B);

    m1C = _mm256_or_si256(_mm256_load_si256(src2), _mm256_load_si256(dst2));
    m1D = _mm256_or_si256(_mm256_load_si256(src2+1), _mm256_load_si256(dst2+1));
    mAccF1 = _mm256_and_si256(mAccF1, m1C);
    mAccF1 = _mm256_and_si256(mAccF1, m1D);

    _mm256_stream_si256(dst2, m1C);
    _mm256_stream_si256(dst2+1, m1D);

    src2 += 2; dst2 += 2;
    } while (src2 < src_end);

    __m256i maskF = _mm256_set1_epi32(~0u);
    mAccF0 = _mm256_and_si256(mAccF0, mAccF1);
    __m256i wcmpA = _mm256_cmpeq_epi8(mAccF0, maskF);
    unsigned maskA = unsigned(_mm256_movemask_epi8(wcmpA));
    return (maskA == ~0u);

    __m256i m1A, m2A, m1B, m2B, m1C, m2C, m1D, m2D;
    __m256i mAccF0 = _mm256_set1_epi32(~0u);
    __m256i mAccF1 = _mm256_set1_epi32(~0u);

    m1A = _mm256_loadu_si256(src+0);
    m2A = _mm256_load_si256(dst+0);
    m1A = _mm256_or_si256(m1A, m2A);
    _mm256_store_si256(dst+0, m1A);

    m1B = _mm256_loadu_si256(src+1);
    m2B = _mm256_load_si256(dst+1);
    m1B = _mm256_or_si256(m1B, m2B);
    _mm256_store_si256(dst+1, m1B);

    m1C = _mm256_loadu_si256(src+2);
    m2C = _mm256_load_si256(dst+2);
    m1C = _mm256_or_si256(m1C, m2C);
    _mm256_store_si256(dst+2, m1C);

    m1D = _mm256_loadu_si256(src+3);
    m2D = _mm256_load_si256(dst+3);
    m1D = _mm256_or_si256(m1D, m2D);
    _mm256_store_si256(dst+3, m1D);

    mAccF1 = _mm256_and_si256(mAccF1, m1C);
    mAccF1 = _mm256_and_si256(mAccF1, m1D);
    mAccF0 = _mm256_and_si256(mAccF0, m1A);
    mAccF0 = _mm256_and_si256(mAccF0, m1B);

    } while (src < src_end);

    __m256i maskF = _mm256_set1_epi32(~0u);
    mAccF0 = _mm256_and_si256(mAccF0, mAccF1);
    __m256i wcmpA = _mm256_cmpeq_epi8(mAccF0, maskF);
    unsigned maskA = unsigned(_mm256_movemask_epi8(wcmpA));
    return (maskA == ~0u);

    __m256i m1A, m1B, m1C, m1D;
    __m256i mAccF0 = _mm256_set1_epi32(~0u);
    __m256i mAccF1 = _mm256_set1_epi32(~0u);

    m1A = _mm256_or_si256(_mm256_load_si256(src1+0), _mm256_load_si256(src2+0));
    m1B = _mm256_or_si256(_mm256_load_si256(src1+1), _mm256_load_si256(src2+1));
    m1C = _mm256_or_si256(_mm256_load_si256(src1+2), _mm256_load_si256(src2+2));
    m1D = _mm256_or_si256(_mm256_load_si256(src1+3), _mm256_load_si256(src2+3));

    _mm256_store_si256(dst+0, m1A);
    _mm256_store_si256(dst+1, m1B);
    _mm256_store_si256(dst+2, m1C);
    _mm256_store_si256(dst+3, m1D);

    mAccF1 = _mm256_and_si256(mAccF1, m1C);
    mAccF1 = _mm256_and_si256(mAccF1, m1D);
    mAccF0 = _mm256_and_si256(mAccF0, m1A);
    mAccF0 = _mm256_and_si256(mAccF0, m1B);

    src1 += 4; src2 += 4; dst += 4;
    } while (src1 < src_end1);

    __m256i maskF = _mm256_set1_epi32(~0u);
    mAccF0 = _mm256_and_si256(mAccF0, mAccF1);
    __m256i wcmpA = _mm256_cmpeq_epi8(mAccF0, maskF);
    unsigned maskA = unsigned(_mm256_movemask_epi8(wcmpA));
    return (maskA == ~0u);

    __m256i m1A, m1B, m1C, m1D;
    __m256i mAccF0 = _mm256_set1_epi32(~0u);
    __m256i mAccF1 = _mm256_set1_epi32(~0u);

    m1A = _mm256_or_si256(_mm256_load_si256(src1+0), _mm256_load_si256(dst+0));
    m1B = _mm256_or_si256(_mm256_load_si256(src1+1), _mm256_load_si256(dst+1));
    m1C = _mm256_or_si256(_mm256_load_si256(src1+2), _mm256_load_si256(dst+2));
    m1D = _mm256_or_si256(_mm256_load_si256(src1+3), _mm256_load_si256(dst+3));

    m1A = _mm256_or_si256(m1A, _mm256_load_si256(src2+0));
    m1B = _mm256_or_si256(m1B, _mm256_load_si256(src2+1));
    m1C = _mm256_or_si256(m1C, _mm256_load_si256(src2+2));
    m1D = _mm256_or_si256(m1D, _mm256_load_si256(src2+3));

    _mm256_store_si256(dst+0, m1A);
    _mm256_store_si256(dst+1, m1B);
    _mm256_store_si256(dst+2, m1C);
    _mm256_store_si256(dst+3, m1D);

    mAccF1 = _mm256_and_si256(mAccF1, m1C);
    mAccF1 = _mm256_and_si256(mAccF1, m1D);
    mAccF0 = _mm256_and_si256(mAccF0, m1A);
    mAccF0 = _mm256_and_si256(mAccF0, m1B);

    src1 += 4; src2 += 4; dst += 4;
    } while (src1 < src_end1);

    __m256i maskF = _mm256_set1_epi32(~0u);
    mAccF0 = _mm256_and_si256(mAccF0, mAccF1);
    __m256i wcmpA = _mm256_cmpeq_epi8(mAccF0, maskF);
    unsigned maskA = unsigned(_mm256_movemask_epi8(wcmpA));
    return (maskA == ~0u);

    __m256i m1A, m1B, m1C, m1D;
    __m256i mAccF0 = _mm256_set1_epi32(~0u);
    __m256i mAccF1 = _mm256_set1_epi32(~0u);

    m1A = _mm256_or_si256(_mm256_load_si256(src1+0), _mm256_load_si256(dst+0));
    m1B = _mm256_or_si256(_mm256_load_si256(src1+1), _mm256_load_si256(dst+1));
    m1C = _mm256_or_si256(_mm256_load_si256(src1+2), _mm256_load_si256(dst+2));
    m1D = _mm256_or_si256(_mm256_load_si256(src1+3), _mm256_load_si256(dst+3));

    m1A = _mm256_or_si256(m1A, _mm256_load_si256(src2+0));
    m1B = _mm256_or_si256(m1B, _mm256_load_si256(src2+1));
    m1C = _mm256_or_si256(m1C, _mm256_load_si256(src2+2));
    m1D = _mm256_or_si256(m1D, _mm256_load_si256(src2+3));

    m1A = _mm256_or_si256(m1A, _mm256_load_si256(src3+0));
    m1B = _mm256_or_si256(m1B, _mm256_load_si256(src3+1));
    m1C = _mm256_or_si256(m1C, _mm256_load_si256(src3+2));
    m1D = _mm256_or_si256(m1D, _mm256_load_si256(src3+3));

    m1A = _mm256_or_si256(m1A, _mm256_load_si256(src4+0));
    m1B = _mm256_or_si256(m1B, _mm256_load_si256(src4+1));
    m1C = _mm256_or_si256(m1C, _mm256_load_si256(src4+2));
    m1D = _mm256_or_si256(m1D, _mm256_load_si256(src4+3));

    _mm256_stream_si256(dst+0, m1A);
    _mm256_stream_si256(dst+1, m1B);
    _mm256_stream_si256(dst+2, m1C);
    _mm256_stream_si256(dst+3, m1D);

    mAccF1 = _mm256_and_si256(mAccF1, m1C);
    mAccF1 = _mm256_and_si256(mAccF1, m1D);
    mAccF0 = _mm256_and_si256(mAccF0, m1A);
    mAccF0 = _mm256_and_si256(mAccF0, m1B);

    src1 += 4; src2 += 4;
    src3 += 4; src4 += 4;
    } while (src1 < src_end1);

    __m256i maskF = _mm256_set1_epi32(~0u);
    mAccF0 = _mm256_and_si256(mAccF0, mAccF1);
    __m256i wcmpA = _mm256_cmpeq_epi8(mAccF0, maskF);
    unsigned maskA = unsigned(_mm256_movemask_epi8(wcmpA));
    return (maskA == ~0u);

    __m256i m1A, m1B, m1C, m1D;
    __m256i accA, accB, accC, accD;

    accA = accB = accC = accD = _mm256_setzero_si256();

    m1A = _mm256_xor_si256(_mm256_load_si256(src+0), _mm256_load_si256(dst+0));
    m1B = _mm256_xor_si256(_mm256_load_si256(src+1), _mm256_load_si256(dst+1));
    m1C = _mm256_xor_si256(_mm256_load_si256(src+2), _mm256_load_si256(dst+2));
    m1D = _mm256_xor_si256(_mm256_load_si256(src+3), _mm256_load_si256(dst+3));

    _mm256_store_si256(dst+0, m1A);
    _mm256_store_si256(dst+1, m1B);
    _mm256_store_si256(dst+2, m1C);
    _mm256_store_si256(dst+3, m1D);

    accA = _mm256_or_si256(accA, m1A);
    accB = _mm256_or_si256(accB, m1B);
    accC = _mm256_or_si256(accC, m1C);
    accD = _mm256_or_si256(accD, m1D);

    } while (src < src_end);

    accA = _mm256_or_si256(accA, accB);
    accC = _mm256_or_si256(accC, accD);
    accA = _mm256_or_si256(accA, accC);
    return !_mm256_testz_si256(accA, accA);

    __m256i m1A, m1B, m1C, m1D;
    __m256i accA, accB, accC, accD;

    accA = accB = accC = accD = _mm256_setzero_si256();

    m1A = _mm256_xor_si256(_mm256_load_si256(src1 + 0), _mm256_load_si256(src2 + 0));
    m1B = _mm256_xor_si256(_mm256_load_si256(src1 + 1), _mm256_load_si256(src2 + 1));
    m1C = _mm256_xor_si256(_mm256_load_si256(src1 + 2), _mm256_load_si256(src2 + 2));
    m1D = _mm256_xor_si256(_mm256_load_si256(src1 + 3), _mm256_load_si256(src2 + 3));

    _mm256_store_si256(dst + 0, m1A);
    _mm256_store_si256(dst + 1, m1B);
    _mm256_store_si256(dst + 2, m1C);
    _mm256_store_si256(dst + 3, m1D);

    accA = _mm256_or_si256(accA, m1A);
    accB = _mm256_or_si256(accB, m1B);
    accC = _mm256_or_si256(accC, m1C);
    accD = _mm256_or_si256(accD, m1D);

    src1 += 4; src2 += 4; dst += 4;
    } while (src1 < src1_end);

    accA = _mm256_or_si256(accA, accB);
    accC = _mm256_or_si256(accC, accD);
    accA = _mm256_or_si256(accA, accC);
    return !_mm256_testz_si256(accA, accA);

    __m256i m1A, m1B, m1C, m1D;
    __m256i accA, accB, accC, accD;
    accA = accB = accC = accD = _mm256_setzero_si256();

    m1A = _mm256_andnot_si256(_mm256_load_si256(src), _mm256_load_si256(dst));
    m1B = _mm256_andnot_si256(_mm256_load_si256(src+1), _mm256_load_si256(dst+1));
    m1C = _mm256_andnot_si256(_mm256_load_si256(src+2), _mm256_load_si256(dst+2));
    m1D = _mm256_andnot_si256(_mm256_load_si256(src+3), _mm256_load_si256(dst+3));

    _mm256_store_si256(dst+2, m1C);
    _mm256_store_si256(dst+3, m1D);
    _mm256_store_si256(dst+0, m1A);
    _mm256_store_si256(dst+1, m1B);

    accA = _mm256_or_si256(accA, m1A);
    accB = _mm256_or_si256(accB, m1B);
    accC = _mm256_or_si256(accC, m1C);
    accD = _mm256_or_si256(accD, m1D);

    } while (src < src_end);

    accA = _mm256_or_si256(accA, accB);
    accC = _mm256_or_si256(accC, accD);
    accA = _mm256_or_si256(accA, accC);
    return !_mm256_testz_si256(accA, accA);

    __m256i m1A, m1B, m1C, m1D;

    m1A = _mm256_andnot_si256(_mm256_load_si256(src+0), _mm256_load_si256(dst+0));
    m1B = _mm256_andnot_si256(_mm256_load_si256(src+1), _mm256_load_si256(dst+1));
    m1C = _mm256_andnot_si256(_mm256_load_si256(src+2), _mm256_load_si256(dst+2));
    m1D = _mm256_andnot_si256(_mm256_load_si256(src+3), _mm256_load_si256(dst+3));

    _mm256_store_si256(dst+0, m1A);
    _mm256_store_si256(dst+1, m1B);
    _mm256_store_si256(dst+2, m1C);
    _mm256_store_si256(dst+3, m1D);

    m1A = _mm256_or_si256(m1A, m1B);
    m1C = _mm256_or_si256(m1C, m1D);
    m1A = _mm256_or_si256(m1A, m1C);
    return _mm256_testz_si256(m1A, m1A);

    __m256i m1A, m1B, m1C, m1D;

    m1A = _mm256_andnot_si256(_mm256_load_si256(src2+0), _mm256_load_si256(src1+0));
    m1B = _mm256_andnot_si256(_mm256_load_si256(src2+1), _mm256_load_si256(src1+1));
    m1C = _mm256_andnot_si256(_mm256_load_si256(src2+2), _mm256_load_si256(src1+2));
    m1D = _mm256_andnot_si256(_mm256_load_si256(src2+3), _mm256_load_si256(src1+3));

    _mm256_store_si256(dst+0, m1A);
    _mm256_store_si256(dst+1, m1B);
    _mm256_store_si256(dst+2, m1C);
    _mm256_store_si256(dst+3, m1D);

    m1A = _mm256_or_si256(m1A, m1B);
    m1C = _mm256_or_si256(m1C, m1D);
    m1A = _mm256_or_si256(m1A, m1C);
    return _mm256_testz_si256(m1A, m1A);

    __m256i m1A, m1B, m1C, m1D;
    __m256i m1E, m1F, m1G, m1H;
    const __m256i maskF = _mm256_set1_epi32(~0u);

    __m256i s1_0, s2_0, s1_1, s2_1;
    s1_0 = _mm256_load_si256(src1 + 0); s2_0 = _mm256_load_si256(src2 + 0);
    s1_1 = _mm256_load_si256(src1 + 1); s2_1 = _mm256_load_si256(src2 + 1);
    s1_0 = _mm256_xor_si256(s1_0, maskF); s2_0 = _mm256_xor_si256(s2_0, maskF);
    s1_1 = _mm256_xor_si256(s1_1, maskF); s2_1 = _mm256_xor_si256(s2_1, maskF);
    m1A = _mm256_and_si256(s1_0, s2_0); m1B = _mm256_and_si256(s1_1, s2_1);
    s1_0 = _mm256_load_si256(src1 + 2); s2_0 = _mm256_load_si256(src2 + 2);
    s1_1 = _mm256_load_si256(src1 + 3); s2_1 = _mm256_load_si256(src2 + 3);
    s1_0 = _mm256_xor_si256(s1_0, maskF); s2_0 = _mm256_xor_si256(s2_0, maskF);
    s1_1 = _mm256_xor_si256(s1_1, maskF); s2_1 = _mm256_xor_si256(s2_1, maskF);
    m1C = _mm256_and_si256(s1_0, s2_0);
    m1D = _mm256_and_si256(s1_1, s2_1);

    __m256i s3_0, s4_0, s3_1, s4_1;
    s3_0 = _mm256_load_si256(src3 + 0); s4_0 = _mm256_load_si256(src4 + 0);
    s3_1 = _mm256_load_si256(src3 + 1); s4_1 = _mm256_load_si256(src4 + 1);
    s3_0 = _mm256_xor_si256(s3_0, maskF); s4_0 = _mm256_xor_si256(s4_0, maskF);
    s3_1 = _mm256_xor_si256(s3_1, maskF); s4_1 = _mm256_xor_si256(s4_1, maskF);
    m1E = _mm256_and_si256(s3_0, s4_0);
    m1F = _mm256_and_si256(s3_1, s4_1);
    m1A = _mm256_and_si256(m1A, m1E);
    m1B = _mm256_and_si256(m1B, m1F);
    s3_0 = _mm256_load_si256(src3 + 2); s4_0 = _mm256_load_si256(src4 + 2);
    s3_1 = _mm256_load_si256(src3 + 3); s4_1 = _mm256_load_si256(src4 + 3);
    s3_0 = _mm256_xor_si256(s3_0, maskF); s4_0 = _mm256_xor_si256(s4_0, maskF);
    s3_1 = _mm256_xor_si256(s3_1, maskF); s4_1 = _mm256_xor_si256(s4_1, maskF);
    m1G = _mm256_and_si256(s3_0, s4_0);
    m1H = _mm256_and_si256(s3_1, s4_1);

    dst0 = _mm256_load_si256(dst + 0); dst1 = _mm256_load_si256(dst + 1);
    m1C = _mm256_and_si256(m1C, m1G);
    m1D = _mm256_and_si256(m1D, m1H);
    m1A = _mm256_and_si256(m1A, dst0);
    m1B = _mm256_and_si256(m1B, dst1);
    dst0 = _mm256_load_si256(dst + 2); dst1 = _mm256_load_si256(dst + 3);
    m1C = _mm256_and_si256(m1C, dst0);
    m1D = _mm256_and_si256(m1D, dst1);

    _mm256_store_si256(dst + 0, m1A);
    _mm256_store_si256(dst + 1, m1B);
    _mm256_store_si256(dst + 2, m1C);
    _mm256_store_si256(dst + 3, m1D);

    m1A = _mm256_or_si256(m1A, m1B);
    m1C = _mm256_or_si256(m1C, m1D);
    m1A = _mm256_or_si256(m1A, m1C);
    return _mm256_testz_si256(m1A, m1A);

    __m256i m1A, m1B, m1C, m1D;
    const __m256i maskF = _mm256_set1_epi32(~0u);

    __m256i s1_0, s2_0, s1_1, s2_1;
    s1_0 = _mm256_load_si256(src1 + 0); s2_0 = _mm256_load_si256(src2 + 0);
    s1_1 = _mm256_load_si256(src1 + 1); s2_1 = _mm256_load_si256(src2 + 1);
    s1_0 = _mm256_xor_si256(s1_0, maskF); s2_0 = _mm256_xor_si256(s2_0, maskF);
    s1_1 = _mm256_xor_si256(s1_1, maskF); s2_1 = _mm256_xor_si256(s2_1, maskF);
    m1A = _mm256_and_si256(s1_0, s2_0); m1B = _mm256_and_si256(s1_1, s2_1);
    s1_0 = _mm256_load_si256(src1 + 2); s2_0 = _mm256_load_si256(src2 + 2);
    s1_1 = _mm256_load_si256(src1 + 3); s2_1 = _mm256_load_si256(src2 + 3);
    s1_0 = _mm256_xor_si256(s1_0, maskF); s2_0 = _mm256_xor_si256(s2_0, maskF);
    s1_1 = _mm256_xor_si256(s1_1, maskF); s2_1 = _mm256_xor_si256(s2_1, maskF);
    m1C = _mm256_and_si256(s1_0, s2_0);
    m1D = _mm256_and_si256(s1_1, s2_1);

    dst0 = _mm256_load_si256(dst + 0); dst1 = _mm256_load_si256(dst + 1);
    m1A = _mm256_and_si256(m1A, dst0);
    m1B = _mm256_and_si256(m1B, dst1);
    dst0 = _mm256_load_si256(dst + 2); dst1 = _mm256_load_si256(dst + 3);
    m1C = _mm256_and_si256(m1C, dst0);
    m1D = _mm256_and_si256(m1D, dst1);

    _mm256_store_si256(dst + 0, m1A);
    _mm256_store_si256(dst + 1, m1B);
    _mm256_store_si256(dst + 2, m1C);
    _mm256_store_si256(dst + 3, m1D);

    m1A = _mm256_or_si256(m1A, m1B);
    m1C = _mm256_or_si256(m1C, m1D);
    m1A = _mm256_or_si256(m1A, m1C);
    return _mm256_testz_si256(m1A, m1A);

    __m256i ymm0 = _mm256_set1_epi32(int(value));

    _mm256_store_si256(dst, ymm0);
    _mm256_store_si256(dst+1, ymm0);
    _mm256_store_si256(dst+2, ymm0);
    _mm256_store_si256(dst+3, ymm0);
    } while (dst < dst_end);

    __m256i ymm0, ymm1, ymm2, ymm3;

    ymm0 = _mm256_load_si256(src+0);
    ymm1 = _mm256_load_si256(src+1);
    ymm2 = _mm256_load_si256(src+2);
    ymm3 = _mm256_load_si256(src+3);

    _mm256_store_si256(dst+0, ymm0);
    _mm256_store_si256(dst+1, ymm1);
    _mm256_store_si256(dst+2, ymm2);
    _mm256_store_si256(dst+3, ymm3);

    ymm0 = _mm256_load_si256(src+4);
    ymm1 = _mm256_load_si256(src+5);
    ymm2 = _mm256_load_si256(src+6);
    ymm3 = _mm256_load_si256(src+7);

    _mm256_store_si256(dst+4, ymm0);
    _mm256_store_si256(dst+5, ymm1);
    _mm256_store_si256(dst+6, ymm2);
    _mm256_store_si256(dst+7, ymm3);

    } while (src < src_end);

    __m256i ymm0, ymm1, ymm2, ymm3;

    ymm0 = _mm256_loadu_si256(src+0);
    ymm1 = _mm256_loadu_si256(src+1);
    ymm2 = _mm256_loadu_si256(src+2);
    ymm3 = _mm256_loadu_si256(src+3);

    _mm256_store_si256(dst+0, ymm0);
    _mm256_store_si256(dst+1, ymm1);
    _mm256_store_si256(dst+2, ymm2);
    _mm256_store_si256(dst+3, ymm3);

    ymm0 = _mm256_loadu_si256(src+4);
    ymm1 = _mm256_loadu_si256(src+5);
    ymm2 = _mm256_loadu_si256(src+6);
    ymm3 = _mm256_loadu_si256(src+7);

    _mm256_store_si256(dst+4, ymm0);
    _mm256_store_si256(dst+5, ymm1);
    _mm256_store_si256(dst+6, ymm2);
    _mm256_store_si256(dst+7, ymm3);

    } while (src < src_end);

    __m256i ymm0, ymm1, ymm2, ymm3;

    ymm0 = _mm256_load_si256(src+0);
    ymm1 = _mm256_load_si256(src+1);
    ymm2 = _mm256_load_si256(src+2);
    ymm3 = _mm256_load_si256(src+3);

    _mm256_stream_si256(dst+0, ymm0);
    _mm256_stream_si256(dst+1, ymm1);
    _mm256_stream_si256(dst+2, ymm2);
    _mm256_stream_si256(dst+3, ymm3);

    ymm0 = _mm256_load_si256(src+4);
    ymm1 = _mm256_load_si256(src+5);
    ymm2 = _mm256_load_si256(src+6);
    ymm3 = _mm256_load_si256(src+7);

    _mm256_stream_si256(dst+4, ymm0);
    _mm256_stream_si256(dst+5, ymm1);
    _mm256_stream_si256(dst+6, ymm2);
    _mm256_stream_si256(dst+7, ymm3);

    } while (src < src_end);

    __m256i ymm0, ymm1, ymm2, ymm3;

    ymm0 = _mm256_loadu_si256(src+0);
    ymm1 = _mm256_loadu_si256(src+1);
    ymm2 = _mm256_loadu_si256(src+2);
    ymm3 = _mm256_loadu_si256(src+3);

    _mm256_stream_si256(dst+0, ymm0);
    _mm256_stream_si256(dst+1, ymm1);
    _mm256_stream_si256(dst+2, ymm2);
    _mm256_stream_si256(dst+3, ymm3);

    ymm0 = _mm256_loadu_si256(src+4);
    ymm1 = _mm256_loadu_si256(src+5);
    ymm2 = _mm256_loadu_si256(src+6);
    ymm3 = _mm256_loadu_si256(src+7);

    _mm256_stream_si256(dst+4, ymm0);
    _mm256_stream_si256(dst+5, ymm1);
    _mm256_stream_si256(dst+6, ymm2);
    _mm256_stream_si256(dst+7, ymm3);

    } while (src < src_end);

    __m256i maskFF = _mm256_set1_epi32(-1);

    ymm0 = _mm256_xor_si256(_mm256_load_si256(dst+0), maskFF);
    ymm1 = _mm256_xor_si256(_mm256_load_si256(dst+1), maskFF);
    _mm256_store_si256(dst+0, ymm0);
    _mm256_store_si256(dst+1, ymm1);

    ymm0 = _mm256_xor_si256(_mm256_load_si256(dst+2), maskFF);
    ymm1 = _mm256_xor_si256(_mm256_load_si256(dst+3), maskFF);
    _mm256_store_si256(dst+2, ymm0);
    _mm256_store_si256(dst+3, ymm1);

    } while (dst < dst_end);

    __m256i w0 = _mm256_load_si256(block+0);
    __m256i w1 = _mm256_load_si256(block+1);
    __m256i wA = _mm256_or_si256(w0, w1);
    __m256i w2 = _mm256_load_si256(block+2);
    __m256i w3 = _mm256_load_si256(block+3);
    __m256i wB = _mm256_or_si256(w2, w3);
    wA = _mm256_or_si256(wA, wB);
    if (!_mm256_testz_si256(wA, wA))
    } while (block < block_end);

    __m256i wA = _mm256_or_si256(_mm256_load_si256(block+0), _mm256_load_si256(block+1));
    __m256i wB = _mm256_or_si256(_mm256_load_si256(block+2), _mm256_load_si256(block+3));
    wA = _mm256_or_si256(wA, wB);
    return _mm256_testz_si256(wA, wA);

    __m256i mV = _mm256_set1_epi32(int(value));
    _mm256_store_si256(dst, mV);
    _mm256_store_si256(dst + 1, mV);
    _mm256_store_si256(dst + 2, mV);
    _mm256_store_si256(dst + 3, mV);

    const __m256i maskF = _mm256_set1_epi32(~0u);

    __m256i m1A = _mm256_load_si256(block+0);
    __m256i m1B = _mm256_load_si256(block+1);
    m1A = _mm256_xor_si256(m1A, maskF);
    m1B = _mm256_xor_si256(m1B, maskF);
    m1A = _mm256_or_si256(m1A, m1B);
    if (!_mm256_testz_si256(m1A, m1A))
    } while (block < block_end);

    __m256i maskF = _mm256_set1_epi32(~0u);
    __m256i wcmpA = _mm256_cmpeq_epi8(_mm256_loadu_si256((__m256i*)ptr), maskF);
    unsigned maskA = unsigned(_mm256_movemask_epi8(wcmpA));
    return (maskA == ~0u);

    __m256i w0 = _mm256_loadu_si256((__m256i*)ptr);
    return _mm256_testz_si256(w0, w0);

    __m256i w0 = _mm256_loadu_si256((__m256i*)ptr0);
    __m256i w1 = _mm256_loadu_si256((__m256i*)ptr1);
    w0 = _mm256_or_si256(w0, w1);
    return _mm256_testz_si256(w0, w0);

    __m256i w0 = _mm256_loadu_si256((__m256i*)ptr0);
    __m256i w1 = _mm256_loadu_si256((__m256i*)ptr1);
    w0 = _mm256_xor_si256(w0, w1);
    return _mm256_testz_si256(w0, w0);
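// Sketch (not part of bmavx2.h): the one-wave predicates above reduce to two idioms,
// _mm256_testz_si256 for "all zero" and a byte-wise compare against ~0 plus movemask
// for "all one". Helper names below are illustrative.
#include <immintrin.h>

inline bool wave_is_zero(const void* p)       // 32 bytes all zero?
{
    __m256i w = _mm256_loadu_si256((const __m256i*)p);
    return _mm256_testz_si256(w, w);
}

inline bool wave_is_all_one(const void* p)    // 32 bytes all 0xFF?
{
    const __m256i maskF = _mm256_set1_epi32(~0u);
    __m256i cmp = _mm256_cmpeq_epi8(_mm256_loadu_si256((const __m256i*)p), maskF);
    return unsigned(_mm256_movemask_epi8(cmp)) == ~0u;
}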

    __m256i* block_end =
    __m256i m1COshft, m2COshft;
    __m256i mAcc = _mm256_set1_epi32(0);
    __m256i mMask1 = _mm256_set1_epi32(1);
    __m256i mCOidx = _mm256_set_epi32(0, 7, 6, 5, 4, 3, 2, 1);

    for (--block_end; block_end >= block; block_end -= 2)
        __m256i m1A = _mm256_load_si256(block_end);
        __m256i m2A = _mm256_load_si256(block_end-1);

        __m256i m1CO = _mm256_and_si256(m1A, mMask1);
        __m256i m2CO = _mm256_and_si256(m2A, mMask1);

        co2 = _mm256_extract_epi32(m1CO, 0);

        m1A = _mm256_srli_epi32(m1A, 1);
        m2A = _mm256_srli_epi32(m2A, 1);

        m1COshft = _mm256_permutevar8x32_epi32(m1CO, mCOidx);
        m1COshft = _mm256_insert_epi32(m1COshft, co1, 7);

        co2 = _mm256_extract_epi32(m2CO, 0);
        m2COshft = _mm256_permutevar8x32_epi32(m2CO, mCOidx);
        m2COshft = _mm256_insert_epi32(m2COshft, co1, 7);

        m1COshft = _mm256_slli_epi32(m1COshft, 31);
        m2COshft = _mm256_slli_epi32(m2COshft, 31);

        m1A = _mm256_or_si256(m1A, m1COshft);
        m2A = _mm256_or_si256(m2A, m2COshft);

        _mm256_store_si256(block_end, m1A);
        _mm256_store_si256(block_end-1, m2A);

        mAcc = _mm256_or_si256(mAcc, m1A);
        mAcc = _mm256_or_si256(mAcc, m2A);

    *empty_acc = !_mm256_testz_si256(mAcc, mAcc);
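// Sketch (not part of bmavx2.h): the carry trick used above, isolated for a single
// 256-bit wave. Each 32-bit word's dropped bit 0 is routed to the high bit of the
// next-lower word with _mm256_permutevar8x32_epi32; the external carry enters the
// top word. Helper name is illustrative.
#include <immintrin.h>

inline unsigned shift_r1_wave(__m256i& w, unsigned co_in)
{
    const __m256i idx = _mm256_set_epi32(0, 7, 6, 5, 4, 3, 2, 1);
    __m256i co = _mm256_and_si256(w, _mm256_set1_epi32(1));   // bit 0 of every word
    unsigned co_out = (unsigned)_mm256_extract_epi32(co, 0);  // bit shifted out of the wave

    w  = _mm256_srli_epi32(w, 1);
    co = _mm256_permutevar8x32_epi32(co, idx);                // word i gets carry of word i+1
    co = _mm256_insert_epi32(co, (int)co_in, 7);              // external carry into top word
    w  = _mm256_or_si256(w, _mm256_slli_epi32(co, 31));
    return co_out;
}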

    const __m256i* block_end =
    __m256i m1COshft, m2COshft;
    __m256i mAcc = _mm256_set1_epi32(0);
    __m256i mCOidx = _mm256_set_epi32(6, 5, 4, 3, 2, 1, 0, 0);

    for (; block < block_end; block += 2)
        __m256i m1A = _mm256_load_si256(block);
        __m256i m2A = _mm256_load_si256(block+1);

        __m256i m1CO = _mm256_srli_epi32(m1A, 31);
        __m256i m2CO = _mm256_srli_epi32(m2A, 31);

        co2 = _mm256_extract_epi32(m1CO, 7);

        m1A = _mm256_slli_epi32(m1A, 1);
        m2A = _mm256_slli_epi32(m2A, 1);

        m1COshft = _mm256_permutevar8x32_epi32(m1CO, mCOidx);
        m1COshft = _mm256_insert_epi32(m1COshft, co1, 0);

        co2 = _mm256_extract_epi32(m2CO, 7);
        m2COshft = _mm256_permutevar8x32_epi32(m2CO, mCOidx);
        m2COshft = _mm256_insert_epi32(m2COshft, co1, 0);

        m1A = _mm256_or_si256(m1A, m1COshft);
        m2A = _mm256_or_si256(m2A, m2COshft);

        _mm256_store_si256(block, m1A);
        _mm256_store_si256(block+1, m2A);

        mAcc = _mm256_or_si256(mAcc, m1A);
        mAcc = _mm256_or_si256(mAcc, m2A);

    *empty_acc = !_mm256_testz_si256(mAcc, mAcc);

    __m256i m1COshft, m2COshft;
    __m256i mAcc = _mm256_set1_epi32(0);
    __m256i mCOidx = _mm256_set_epi32(6, 5, 4, 3, 2, 1, 0, 0);

    unsigned di = co1 ? 0 : unsigned(_tzcnt_u64(d));
    for (; di < 64; ++di)
        mAcc = _mm256_xor_si256(mAcc, mAcc);
        mask_block = (__m256i*) &mblock[d_base];
        block = (__m256i*) &wblock[d_base];
        for (unsigned i = 0; i < 2; ++i, block += 2, mask_block += 2)
            __m256i m1A = _mm256_load_si256(block);
            __m256i m2A = _mm256_load_si256(block+1);

            __m256i m1CO = _mm256_srli_epi32(m1A, 31);
            __m256i m2CO = _mm256_srli_epi32(m2A, 31);

            co2 = _mm256_extract_epi32(m1CO, 7);

            m1A = _mm256_slli_epi32(m1A, 1);
            m2A = _mm256_slli_epi32(m2A, 1);

            __m256i m1M = _mm256_load_si256(mask_block);
            __m256i m2M = _mm256_load_si256(mask_block+1);

            m1COshft = _mm256_insert_epi32(
                           _mm256_permutevar8x32_epi32(m1CO, mCOidx),
            co2 = _mm256_extract_epi32(m2CO, 7);
            m2COshft = _mm256_insert_epi32(
                           _mm256_permutevar8x32_epi32(m2CO, mCOidx),

            m1A = _mm256_or_si256(m1A, m1COshft);
            m2A = _mm256_or_si256(m2A, m2COshft);

            m1A = _mm256_and_si256(m1A, m1M);
            m2A = _mm256_and_si256(m2A, m2M);

            _mm256_store_si256(block, m1A);
            _mm256_store_si256(block+1, m2A);

            mAcc = _mm256_or_si256(mAcc, m1A);
            mAcc = _mm256_or_si256(mAcc, m2A);

        if (_mm256_testz_si256(mAcc, mAcc))

        bm::id64_t w0 = wblock[d_base] = (co1 & mblock[d_base]);
        d |= (dmask & (w0 << di));

    const __m256i* block_end =
    __m256i m1COshft, m2COshft;
    __m256i mCOidx = _mm256_set_epi32(6, 5, 4, 3, 2, 1, 0, 0);
    __m256i cntAcc = _mm256_setzero_si256();

    unsigned co2, co1 = 0;
    for (; block < block_end; block += 2)
        __m256i m1A = _mm256_load_si256(block);
        __m256i m2A = _mm256_load_si256(block+1);

        __m256i m1CO = _mm256_srli_epi32(m1A, 31);
        __m256i m2CO = _mm256_srli_epi32(m2A, 31);

        co2 = _mm256_extract_epi32(m1CO, 7);

        __m256i m1As = _mm256_slli_epi32(m1A, 1);
        __m256i m2As = _mm256_slli_epi32(m2A, 1);

        m1COshft = _mm256_permutevar8x32_epi32(m1CO, mCOidx);
        m1COshft = _mm256_insert_epi32(m1COshft, co1, 0);

        co2 = _mm256_extract_epi32(m2CO, 7);
        m2COshft = _mm256_permutevar8x32_epi32(m2CO, mCOidx);
        m2COshft = _mm256_insert_epi32(m2COshft, co1, 0);

        m1As = _mm256_or_si256(m1As, m1COshft);
        m2As = _mm256_or_si256(m2As, m2COshft);

        m1A = _mm256_xor_si256(m1A, m1As);
        m2A = _mm256_xor_si256(m2A, m2As);

        cntAcc = _mm256_add_epi64(cntAcc, bc);
        cntAcc = _mm256_add_epi64(cntAcc, bc);

    _mm256_store_si256((__m256i*)cnt_v, cntAcc);
    count += (unsigned)(cnt_v[0] + cnt_v[1] + cnt_v[2] + cnt_v[3]);
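// Sketch (not part of bmavx2.h): the loop above popcounts w ^ (w << 1 | carry), i.e.
// it counts positions where adjacent bits differ; one plus that count is the number
// of runs (GAPs) in the block. The same identity in scalar form (C++20 std::popcount;
// helper name is illustrative):
#include <cstdint>
#include <bit>

inline unsigned bit_block_change_scalar(const std::uint64_t* w, unsigned n)
{
    unsigned count = 1;            // a non-empty bit string has at least one run
    std::uint64_t co = w[0] & 1u;  // seed carry with the first bit so it adds no change
    for (unsigned i = 0; i < n; ++i)
    {
        std::uint64_t shifted = (w[i] << 1) | co;   // previous bit aligned under current bit
        count += (unsigned)std::popcount(w[i] ^ shifted);
        co = w[i] >> 63;
    }
    return count;
}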

    __m256i m1COshft, m2COshft;
    __m256i mCOidx = _mm256_set_epi32(6, 5, 4, 3, 2, 1, 0, 0);

    __m256i cntAcc = _mm256_setzero_si256();
    __m256i cntAcc2 = _mm256_setzero_si256();

    unsigned bit_count = 0;
    unsigned gap_count = 1;

    unsigned co2, co1 = 0;
    for (; block < block_end; block += 2, xor_block += 2)
        __m256i m1A = _mm256_load_si256(block);
        __m256i m2A = _mm256_load_si256(block+1);
        __m256i m1B = _mm256_load_si256(xor_block);
        __m256i m2B = _mm256_load_si256(xor_block+1);

        m1A = _mm256_xor_si256(m1A, m1B);
        m2A = _mm256_xor_si256(m2A, m2B);

        cntAcc2 = _mm256_add_epi64(cntAcc2, bc);
        cntAcc2 = _mm256_add_epi64(cntAcc2, bc);

        __m256i m1CO = _mm256_srli_epi32(m1A, 31);
        __m256i m2CO = _mm256_srli_epi32(m2A, 31);

        co2 = _mm256_extract_epi32(m1CO, 7);

        __m256i m1As = _mm256_slli_epi32(m1A, 1);
        __m256i m2As = _mm256_slli_epi32(m2A, 1);

        m1COshft = _mm256_permutevar8x32_epi32(m1CO, mCOidx);
        m1COshft = _mm256_insert_epi32(m1COshft, co1, 0);

        co2 = _mm256_extract_epi32(m2CO, 7);
        m2COshft = _mm256_permutevar8x32_epi32(m2CO, mCOidx);
        m2COshft = _mm256_insert_epi32(m2COshft, co1, 0);

        m1As = _mm256_or_si256(m1As, m1COshft);
        m2As = _mm256_or_si256(m2As, m2COshft);

        m1A = _mm256_xor_si256(m1A, m1As);
        m2A = _mm256_xor_si256(m2A, m2As);

        cntAcc = _mm256_add_epi64(cntAcc, bc);
        cntAcc = _mm256_add_epi64(cntAcc, bc);

    _mm256_store_si256((__m256i*)cnt_v, cntAcc);
    gap_count += (unsigned)(cnt_v[0] + cnt_v[1] + cnt_v[2] + cnt_v[3]);
    gap_count -= (w0 & 1u);

    _mm256_store_si256((__m256i*)cnt_v, cntAcc2);
    bit_count += (unsigned)(cnt_v[0] + cnt_v[1] + cnt_v[2] + cnt_v[3]);

    *gcount = gap_count;
    *bcount = bit_count;

                       unsigned* gcount, unsigned* bcount)
    const __m256i* block_end =
    __m256i m1COshft, m2COshft;
    __m256i mCOidx = _mm256_set_epi32(6, 5, 4, 3, 2, 1, 0, 0);
    __m256i cntAcc = _mm256_setzero_si256();

    unsigned bit_count = 0;
    unsigned gap_count = 1;

    unsigned co2, co1 = 0;
    for (; block < block_end; block += 2)
        __m256i m1A = _mm256_load_si256(block);
        __m256i m2A = _mm256_load_si256(block+1);

        __m256i m1CO = _mm256_srli_epi32(m1A, 31);
        __m256i m2CO = _mm256_srli_epi32(m2A, 31);

        co2 = _mm256_extract_epi32(m1CO, 7);

        __m256i m1As = _mm256_slli_epi32(m1A, 1);
        __m256i m2As = _mm256_slli_epi32(m2A, 1);

        m1COshft = _mm256_permutevar8x32_epi32(m1CO, mCOidx);
        m1COshft = _mm256_insert_epi32(m1COshft, co1, 0);

        co2 = _mm256_extract_epi32(m2CO, 7);
        m2COshft = _mm256_permutevar8x32_epi32(m2CO, mCOidx);
        m2COshft = _mm256_insert_epi32(m2COshft, co1, 0);

        m1As = _mm256_or_si256(m1As, m1COshft);
        m2As = _mm256_or_si256(m2As, m2COshft);

        m1A = _mm256_xor_si256(m1A, m1As);
        m2A = _mm256_xor_si256(m2A, m2As);

        cntAcc = _mm256_add_epi64(cntAcc, bc);
        cntAcc = _mm256_add_epi64(cntAcc, bc);

    _mm256_store_si256((__m256i*)cnt_v, cntAcc);
    gap_count += (unsigned)(cnt_v[0] + cnt_v[1] + cnt_v[2] + cnt_v[3]);
    gap_count -= (w0 & 1u);

    *gcount = gap_count;
    *bcount = bit_count;

    const __m256i* block1_end =
    __m256i maskZ = _mm256_setzero_si256();
    unsigned simd_lane = 0;

    mA = _mm256_xor_si256(_mm256_load_si256(block1), _mm256_load_si256(block2));
    mB = _mm256_xor_si256(_mm256_load_si256(block1+1), _mm256_load_si256(block2+1));
    __m256i mOR = _mm256_or_si256(mA, mB);
    if (!_mm256_testz_si256(mOR, mOR))
        if (!_mm256_testz_si256(mA, mA))
            unsigned mask = ~_mm256_movemask_epi8(_mm256_cmpeq_epi32(mA, maskZ));
            int bsf = bm::bsf_asm32(mask);
            _mm256_store_si256((__m256i*)simd_buf, mA);
            unsigned widx = bsf >> 2;
            unsigned w = simd_buf[widx];
            bsf = bm::bsf_asm32(w);
            *pos = (simd_lane * 256) + (widx * 32) + bsf;

        unsigned mask = ~_mm256_movemask_epi8(_mm256_cmpeq_epi32(mB, maskZ));
        int bsf = bm::bsf_asm32(mask);
        _mm256_store_si256((__m256i*)simd_buf, mB);
        unsigned widx = bsf >> 2;
        unsigned w = simd_buf[widx];
        bsf = bm::bsf_asm32(w);
        *pos = ((++simd_lane) * 256) + (widx * 32) + bsf;

    block1 += 2; block2 += 2;
    } while (block1 < block1_end);

    block = (const __m256i*)((bm::word_t*)(block) + off);
    const __m256i* block_end =
    __m256i maskZ = _mm256_setzero_si256();
    unsigned simd_lane = 0;

    mA = _mm256_load_si256(block); mB = _mm256_load_si256(block+1);
    __m256i mOR = _mm256_or_si256(mA, mB);
    if (!_mm256_testz_si256(mOR, mOR))
        if (!_mm256_testz_si256(mA, mA))
            unsigned mask = ~_mm256_movemask_epi8(_mm256_cmpeq_epi32(mA, maskZ));
            int bsf = bm::bsf_asm32(mask);
            _mm256_store_si256((__m256i*)simd_buf, mA);
            unsigned widx = bsf >> 2;
            unsigned w = simd_buf[widx];
            bsf = bm::bsf_asm32(w);
            *pos = (off * 32) + (simd_lane * 256) + (widx * 32) + bsf;

        unsigned mask = ~_mm256_movemask_epi8(_mm256_cmpeq_epi32(mB, maskZ));
        int bsf = bm::bsf_asm32(mask);
        _mm256_store_si256((__m256i*)simd_buf, mB);
        unsigned widx = bsf >> 2;
        unsigned w = simd_buf[widx];
        bsf = bm::bsf_asm32(w);
        *pos = (off * 32) + ((++simd_lane) * 256) + (widx * 32) + bsf;

    } while (block < block_end);

                       unsigned avx_vect_waves,
    __m256i xcnt = _mm256_setzero_si256();

    for (unsigned i = 0; i < avx_vect_waves; ++i)
        __m256i ymm0 = _mm256_loadu_si256((__m256i*)(pbuf - 1));
        __m256i ymm1 = _mm256_loadu_si256((__m256i*)(pbuf + 16 - 1));
        __m256i ymm_s2 = _mm256_add_epi16(ymm1, ymm0);
        xcnt = _mm256_add_epi16(xcnt, ymm_s2);

    xcnt = _mm256_sub_epi16(_mm256_bsrli_epi128(xcnt, 2), xcnt);

    xcnt = _mm256_add_epi16(_mm256_bsrli_epi128(xcnt, 4), xcnt);
    xcnt = _mm256_add_epi16(_mm256_bsrli_epi128(xcnt, 8), xcnt);
    __m128i xcnt2 = _mm_add_epi16(_mm256_extracti128_si256(xcnt, 1), _mm256_extracti128_si256(xcnt, 0));

                       unsigned nb, unsigned start)
    const unsigned unroll_factor = 16;
    const unsigned len = (size - start);
    const unsigned len_unr = len - (len % unroll_factor);

    __m256i nbM = _mm256_set1_epi32(int(nb));

    for (k = 0; k < len_unr; k += unroll_factor)
        __m256i idxA = _mm256_loadu_si256((__m256i*)(idx+k));
        __m256i wcmpA = _mm256_cmpeq_epi8(nbM, nbA);
        if (~0u != unsigned(_mm256_movemask_epi8(wcmpA)))
        __m256i idxB = _mm256_loadu_si256((__m256i*)(idx+k+8));
        __m256i wcmpB = _mm256_cmpeq_epi8(nbM, nbB);
        if (~0u != unsigned(_mm256_movemask_epi8(wcmpB)))
    for (; k < len; ++k)

                       unsigned start, unsigned stop)
    const unsigned unroll_factor = 8;
    const unsigned len = (stop - start);
    const unsigned len_unr = len - (len % unroll_factor);

    __m256i mask1 = _mm256_set1_epi32(1);

    unsigned k = 0, mask, w_idx;
    for (; k < len_unr; k += unroll_factor)
        __m256i idxA = _mm256_loadu_si256((__m256i*)(idx+k));
        __m256i nbitA = _mm256_and_si256(idxA, sb_mask);
        nbitA = _mm256_and_si256(nbitA, sw_mask);
        __m256i maskA = _mm256_sllv_epi32(mask1, nbitA);
        _mm256_store_si256((__m256i*)mword_v, nwordA);

        mask_tmp = _mm256_shuffle_epi32(nwordA, _MM_SHUFFLE(1,1,1,1));
        mask_tmp = _mm256_permute2x128_si256(mask_tmp, mask_tmp, 0);
        mask = _mm256_movemask_epi8(_mm256_cmpeq_epi32(mask_tmp, nwordA));

        mask_tmp = _mm256_xor_si256(mask_tmp, mask_tmp);
        mask_tmp = _mm256_or_si256(mask_tmp, maskA);

        __m256i mtmp0 = _mm256_permute2x128_si256(mask_tmp, mask_tmp, 0);
        __m256i mtmp1 = _mm256_permute2x128_si256(mask_tmp, mask_tmp, 1);
        mask_tmp = _mm256_or_si256(mtmp0, mtmp1);
        mtmp0 = _mm256_bsrli_epi128(mask_tmp, 4);
        mask_tmp = _mm256_or_si256(mtmp0, mask_tmp);
        mtmp0 = _mm256_bsrli_epi128(mask_tmp, 8);
        mask_tmp = _mm256_or_si256(mtmp0, mask_tmp);

        int u0 = _mm256_extract_epi32(mask_tmp, 0);

        _mm256_store_si256((__m256i*)mask_v, maskA);

        mask_tmp = _mm256_bsrli_epi128(maskA, 4);
        mask_tmp = _mm256_or_si256(mask_tmp, maskA);
        __m256i m0 = _mm256_bsrli_epi128(mask_tmp, 8);
        mask_tmp = _mm256_or_si256(m0, mask_tmp);

        u0 = _mm256_extract_epi32(mask_tmp, 0);
        u4 = _mm256_extract_epi32(mask_tmp, 4);

        mask_tmp = _mm256_permute2x128_si256(nwordA, nwordA, 0);
        __m256i m0 = _mm256_shuffle_epi32(mask_tmp, 0x0);
        mask = _mm256_movemask_epi8(_mm256_cmpeq_epi32(mask_tmp, m0));

        block[mword_v[0]] |= mask_v[0];
        block[mword_v[1]] |= mask_v[1];
        block[mword_v[2]] |= mask_v[2];
        block[mword_v[3]] |= mask_v[3];

        mask_tmp = _mm256_permute2x128_si256(nwordA, nwordA, 1);
        __m256i m0 = _mm256_shuffle_epi32(mask_tmp, 0x0);
        mask = _mm256_movemask_epi8(_mm256_cmpeq_epi32(mask_tmp, m0));

        block[mword_v[4]] |= mask_v[4];
        block[mword_v[5]] |= mask_v[5];
        block[mword_v[6]] |= mask_v[6];
        block[mword_v[7]] |= mask_v[7];

    for (; k < len; ++k)
        unsigned n = idx[k];
        block[nword] |= (1u << nbit);

    __m256i stride_idx = _mm256_set_epi32(224, 192, 160, 128, 96, 64, 32, 0);
    __m256i mask1 = _mm256_set1_epi32(1);

    __m256i v0, v1, acc1, acc2;
    v0 = _mm256_permutevar8x32_epi32(source, _mm256_set1_epi32(0));
    v1 = _mm256_permutevar8x32_epi32(source, _mm256_set1_epi32(1));
    v0 = _mm256_sub_epi32(v0, stride_idx);
    v1 = _mm256_sub_epi32(v1, stride_idx);
    v0 = _mm256_sllv_epi32(mask1, v0);
    v1 = _mm256_sllv_epi32(mask1, v1);
    acc1 = _mm256_or_si256(v1, v0);

    v0 = _mm256_permutevar8x32_epi32(source, _mm256_set1_epi32(2));
    v1 = _mm256_permutevar8x32_epi32(source, _mm256_set1_epi32(3));
    v0 = _mm256_sub_epi32(v0, stride_idx);
    v1 = _mm256_sub_epi32(v1, stride_idx);
    v0 = _mm256_sllv_epi32(mask1, v0);
    v1 = _mm256_sllv_epi32(mask1, v1);
    acc2 = _mm256_or_si256(v1, v0);
    target = _mm256_or_si256(target, acc1);

    v0 = _mm256_permutevar8x32_epi32(source, _mm256_set1_epi32(4));
    v1 = _mm256_permutevar8x32_epi32(source, _mm256_set1_epi32(5));
    v0 = _mm256_sub_epi32(v0, stride_idx);
    v1 = _mm256_sub_epi32(v1, stride_idx);
    v0 = _mm256_sllv_epi32(mask1, v0);
    v1 = _mm256_sllv_epi32(mask1, v1);
    acc1 = _mm256_or_si256(v1, v0);
    target = _mm256_or_si256(target, acc2);

    v0 = _mm256_permutevar8x32_epi32(source, _mm256_set1_epi32(6));
    v1 = _mm256_permutevar8x32_epi32(source, _mm256_set1_epi32(7));
    v0 = _mm256_sub_epi32(v0, stride_idx);
    v1 = _mm256_sub_epi32(v1, stride_idx);
    v0 = _mm256_sllv_epi32(mask1, v0);
    v1 = _mm256_sllv_epi32(mask1, v1);
    acc2 = _mm256_or_si256(v1, v0);

    target = _mm256_or_si256(target, acc1);
    target = _mm256_or_si256(target, acc2);

                       unsigned start, unsigned stop)
    __m256i stride_idx = _mm256_set_epi32(224, 192, 160, 128, 96, 64, 32, 0);
    __m256i mask1 = _mm256_set1_epi32(1);
    __m256i* block_avx = (__m256i*)block;

    unsigned stride = 0;
    __m256i* avx_stride_p = block_avx + stride;
    __m256i blkA = _mm256_load_si256(avx_stride_p);

    for (unsigned i = start; i < stop; ++i)
        unsigned n = idx[i];
        unsigned new_stride = nbit >> 8;
        unsigned stride_bit = nbit & 0xFF;
        if (new_stride != stride)
            _mm256_store_si256(avx_stride_p, blkA);
            stride = new_stride;
            avx_stride_p = block_avx + stride;
            blkA = _mm256_load_si256(avx_stride_p);

        __m256i v0 = _mm256_set1_epi32(stride_bit);
        __m256i s0 = _mm256_sub_epi32(v0, stride_idx);
        __m256i k0 = _mm256_sllv_epi32(mask1, s0);
        blkA = _mm256_or_si256(blkA, k0);

    _mm256_store_si256(avx_stride_p, blkA);

                       unsigned start, unsigned stop)
    const unsigned unroll_factor = 8;
    const unsigned len = (stop - start);
    const unsigned len_unr = len - (len % unroll_factor);

    __m256i stride_idx = _mm256_set_epi32(224, 192, 160, 128, 96, 64, 32, 0);
    __m256i mask1 = _mm256_set1_epi32(1);
    __m256i stride_bit_mask = _mm256_set1_epi32(0xFF);

    __m256i* block_avx = (__m256i*)block;
    __m256i* avx_stride_p = block_avx + stride;
    __m256i blkA = _mm256_load_si256(avx_stride_p);

    unsigned k = 0, mask;
    for (; k < len_unr; k += unroll_factor)
        __m256i idxA = _mm256_loadu_si256((__m256i*)(idx+k));
        __m256i nbitA = _mm256_and_si256(idxA, sb_mask);
        __m256i strideA = _mm256_srli_epi32(nbitA, 8);
        __m256i strideBitA = _mm256_and_si256(nbitA, stride_bit_mask);

        __m256i mask_tmp = _mm256_shuffle_epi32(strideA, 0x0);
        mask_tmp = _mm256_permute2x128_si256(mask_tmp, mask_tmp, 0);
        mask = _mm256_movemask_epi8(_mm256_cmpeq_epi32(mask_tmp, strideA));

        unsigned new_stride = (unsigned)_mm256_extract_epi32(strideA, 0);
        if (new_stride != stride)
            _mm256_store_si256(avx_stride_p, blkA);
            stride = new_stride;
            avx_stride_p = block_avx + stride;
            blkA = _mm256_load_si256(avx_stride_p);

        _mm256_store_si256((__m256i*)mstride_bit_v, strideBitA);
        _mm256_store_si256((__m256i*)mstride_v, strideA);
        for (unsigned j = 0; j < 8; ++j)
            unsigned new_stride = mstride_v[j];
            if (new_stride != stride)
                _mm256_store_si256(avx_stride_p, blkA);
                stride = new_stride;
                avx_stride_p = block_avx + stride;
                blkA = _mm256_load_si256(avx_stride_p);

            mask_tmp = _mm256_set1_epi32(mstride_bit_v[j]);
            mask_tmp = _mm256_sub_epi32(mask_tmp, stride_idx);
            mask_tmp = _mm256_sllv_epi32(mask1, mask_tmp);
            blkA = _mm256_or_si256(blkA, mask_tmp);

    _mm256_store_si256(avx_stride_p, blkA);

    for (; k < len; ++k)
        unsigned n = idx[k];
        block[nword] |= (1u << nbit);

    __m256i stride_idx1 = _mm256_set_epi32(224, 192, 160, 128, 96, 64, 32, 0);
    __m256i stride_idx2 = _mm256_add_epi32(stride_idx1, _mm256_set1_epi32(32));
    __m256i maskFF = _mm256_set1_epi32(-1);
    __m256i maskZ = _mm256_setzero_si256();

    __m256i v0 = _mm256_set1_epi32(i);
    __m256i s0 = _mm256_sub_epi32(v0, stride_idx1);
    __m256i k1 = _mm256_sllv_epi32(maskFF, s0);

    __m256i cmp_eq = _mm256_cmpeq_epi32(k1, maskZ);
    cmp_eq = _mm256_xor_si256(maskFF, cmp_eq);
    k1 = _mm256_xor_si256(k1, cmp_eq);

    __m256i cmp_gt = _mm256_cmpgt_epi32(stride_idx2, v0);
    cmp_gt = _mm256_xor_si256(maskFF, cmp_gt);
    __m256i r = _mm256_xor_si256(k1, cmp_gt);

inline
int avx2_cmpge_u32(__m256i vect8, unsigned value)
{
    // AVX2 has no unsigned 32-bit compare; bias both sides by 0x80000000
    // so that the signed compare gives the unsigned ordering.
    __m256i mask0x8 = _mm256_set1_epi32(0x80000000);
    __m256i mm_val = _mm256_set1_epi32(value);

    __m256i norm_vect8 = _mm256_sub_epi32(vect8, mask0x8);
    __m256i norm_val = _mm256_sub_epi32(mm_val, mask0x8);

    __m256i cmp_mask_gt = _mm256_cmpgt_epi32(norm_vect8, norm_val);
    __m256i cmp_mask_eq = _mm256_cmpeq_epi32(mm_val, vect8);

    __m256i cmp_mask_ge = _mm256_or_si256(cmp_mask_gt, cmp_mask_eq);
    int mask = _mm256_movemask_epi8(cmp_mask_ge);
    if (mask)
    {
        int bsf = bm::bsf_asm32(mask); // index of the first matching byte
        return bsf / 4;                // 4 bytes per 32-bit lane
    }
    return -1; // no lane is >= value
}
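A quick usage sketch with illustrative values (assumes this header and its bm:: namespace are available; -1 is taken to mean "no lane qualifies"):

#include <immintrin.h>
#include <cstdio>

int main()
{
    // eight sorted unsigned keys packed into one AVX2 register
    alignas(32) unsigned keys[8] = { 3, 7, 9, 100, 250, 1000, 40000, 3000000000u };
    __m256i v = _mm256_load_si256((const __m256i*)keys);

    int lane = bm::avx2_cmpge_u32(v, 200);             // first lane with keys[lane] >= 200
    std::printf("first lane >= 200: %d\n", lane);      // expected: 4 (keys[4] == 250)
}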

inline
int avx2_cmpge_u16(__m256i vect16, unsigned short value)
{
    __m256i mZ = _mm256_setzero_si256();
    __m256i mVal = _mm256_set1_epi16(value);

    // saturated subtract: (value - v[i]) saturates to 0 exactly when v[i] >= value
    __m256i mSub = _mm256_subs_epu16(mVal, vect16);
    __m256i mge_mask = _mm256_cmpeq_epi16(mSub, mZ);
    unsigned mask = _mm256_movemask_epi8(mge_mask);
    if (mask)
    {
        int lz = _tzcnt_u32(mask);
        return lz / 2; // 2 bytes per 16-bit lane
    }
    return -1; // no lane is >= value
}
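The saturating-subtract comparison is used because AVX2 has no unsigned 16-bit compare. The same identity in scalar form (hypothetical helper name ge_u16, sketch only):

#include <cstdint>
#include <cassert>

// (value - v) with saturation at 0 is 0 exactly when v >= value (unsigned).
inline bool ge_u16(std::uint16_t v, std::uint16_t value)
{
    unsigned sat = (value > v) ? unsigned(value - v) : 0u;  // _mm256_subs_epu16 analogue
    return sat == 0;
}

int main()
{
    assert(ge_u16(10, 10) && ge_u16(11, 10) && !ge_u16(9, 10));
}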

template <bool RET_TEST=false>
unsigned avx2_gap_bfind(const unsigned short* buf,
                        unsigned pos, unsigned* is_set)
{
    const unsigned linear_cutoff = 64;
    const unsigned unroll_factor = 16; // 16-bit elements per AVX2 register

    unsigned res;
    unsigned start = 1;
    unsigned end = ((*buf) >> 3);      // element count is kept in the GAP header word
    const unsigned arr_end = end + 1;

    if (end <= unroll_factor) // short array: plain scalar scan
    {
        for (; true; ++start)
            if (buf[start] >= pos)
                break;
    }
    else
    {
        // stage 1: unrolled binary search while the window is wide
        unsigned dsize = end - start;
        for (; dsize >= 64; dsize = end - start)
        {
            unsigned mid = (start + end) >> 1;
            if (buf[mid] < pos)
                start = mid + 1;
            else
                end = mid;
            if (buf[mid = (start + end) >> 1] < pos)
                start = mid + 1;
            else
                end = mid;
            if (buf[mid = (start + end) >> 1] < pos)
                start = mid + 1;
            else
                end = mid;
            if (buf[mid = (start + end) >> 1] < pos)
                start = mid + 1;
            else
                end = mid;
        } // for dsize

        // stage 2: switch to a scan over the remaining window
        dsize = end - start + 1;
        if (dsize < linear_cutoff)
        {
            dsize = arr_end - start;

            __m256i mZ = _mm256_setzero_si256();
            __m256i mPos = _mm256_set1_epi16((unsigned short)pos);
            __m256i vect16, mSub, mge_mask;

            for (unsigned len_unr = start + (dsize - (dsize % unroll_factor));
                 start < len_unr; start += unroll_factor)
            {
                vect16 = _mm256_loadu_si256((__m256i*)(&buf[start]));
                mSub = _mm256_subs_epu16(mPos, vect16);   // 0 where buf[i] >= pos
                mge_mask = _mm256_cmpeq_epi16(mSub, mZ);
                if (int mask = _mm256_movemask_epi8(mge_mask); mask)
                {
                    int lz = _tzcnt_u32(mask);
                    start += unsigned(lz) / 2;
                    goto l_ret;
                }
            } // for
            // tail wave (GAP arrays are 0xFFFF-terminated, so a match is guaranteed)
            vect16 = _mm256_loadu_si256((__m256i*)(&buf[start]));
            mSub = _mm256_subs_epu16(mPos, vect16);
            mge_mask = _mm256_cmpeq_epi16(mSub, mZ);
            int mask = _mm256_movemask_epi8(mge_mask);
            int lz = _tzcnt_u32(mask);
            start += unsigned(lz) / 2;
        }
        else // window still wide: two more bisection probes, then a scalar scan
        {
            if (unsigned mid = (start + end) >> 1; buf[mid] < pos)
                start = mid + 1;
            else
                end = mid;
            if (unsigned mid = (start + end) >> 1; buf[mid] < pos)
                start = mid + 1;
            else
                end = mid;
            for (; true; ++start)
                if (buf[start] >= pos)
                    break;
        }
    }
l_ret:
    res = ((*buf) & 1) ^ ((start-1) & 1); // run parity gives the bit value at pos
    if constexpr(RET_TEST)
        return res;
    *is_set = res;
    return start;
}

inline
unsigned avx2_gap_test(const unsigned short* buf, unsigned pos)
{
    return bm::avx2_gap_bfind<true>(buf, pos, 0);
}
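For orientation, a GAP block is a sorted array of 16-bit run boundaries behind a small header word (bit 0 of the header is the value of the first run). The scalar sketch below (hypothetical helper, not part of this header) shows the same lookup the hybrid routine above accelerates:

typedef unsigned short gap_word_t;

// Scalar sketch: buf[0] is the header, buf[1..] are ascending run-end positions,
// terminated by 0xFFFF, so the scan always stops for pos < 65536.
inline unsigned scalar_gap_bfind(const gap_word_t* buf, unsigned pos, unsigned* is_set)
{
    unsigned start = 1;
    while (buf[start] < pos)                      // linear lower-bound scan
        ++start;
    *is_set = ((*buf) & 1) ^ ((start - 1) & 1);   // run parity gives the bit value
    return start;
}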

inline
unsigned avx2_lower_bound_scan_u32(const unsigned* arr, unsigned target,
                                   unsigned from, unsigned to)
{
    const unsigned* arr_base = &arr[from]; // base for the unrolled scan

    const unsigned unroll_factor = 8;
    const unsigned len = to - from + 1;
    const unsigned len_unr = len - (len % unroll_factor);

    // unsigned >= via the 0x80000000 bias trick (see avx2_cmpge_u32 above)
    __m256i mask0x8 = _mm256_set1_epi32(0x80000000);
    __m256i vect_target = _mm256_set1_epi32(target);
    __m256i norm_target = _mm256_sub_epi32(vect_target, mask0x8);

    int mask;
    __m256i vect80, norm_vect80, cmp_mask_ge;

    unsigned k = 0;
    for (; k < len_unr; k += unroll_factor)
    {
        vect80 = _mm256_loadu_si256((__m256i*)(&arr_base[k])); // 8 u32 elements
        norm_vect80 = _mm256_sub_epi32(vect80, mask0x8);

        cmp_mask_ge = _mm256_or_si256(                           // GT | EQ
            _mm256_cmpgt_epi32(norm_vect80, norm_target),
            _mm256_cmpeq_epi32(vect80, vect_target)
        );
        mask = _mm256_movemask_epi8(cmp_mask_ge);
        if (mask)
        {
            int bsf = bm::bsf_asm32(mask); // first matching byte position
            return from + k + (bsf / 4);
        }
    } // for k

    for (; k < len; ++k) // scalar tail
    {
        if (arr_base[k] >= target)
            return from + k;
    }
    return to + 1; // not found: target is greater than every element in [from..to]
}
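A small usage sketch with illustrative data, calling the prototype listed in the index below:

#include <cstdio>

int main()
{
    // sorted, ascending array of unsigned keys (illustrative values)
    static const unsigned arr[16] = { 1, 4, 8, 15, 16, 23, 42, 64,
                                      100, 200, 300, 400, 500, 600, 700, 800 };

    unsigned pos = bm::avx2_lower_bound_scan_u32(arr, 42, 0, 15);
    std::printf("lower_bound(42) = %u\n", pos);   // expected: 6 (arr[6] == 42)

    pos = bm::avx2_lower_bound_scan_u32(arr, 43, 0, 15);
    std::printf("lower_bound(43) = %u\n", pos);   // expected: 7 (first element >= 43)
}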

inline
void avx2_bit_block_gather_scatter(unsigned* arr, const unsigned* blk,
                                   const unsigned* idx, unsigned size,
                                   unsigned start, unsigned bit_idx)
{
    const unsigned unroll_factor = 8;
    const unsigned len = (size - start);
    const unsigned len_unr = len - (len % unroll_factor);

    __m256i sb_mask = _mm256_set1_epi32(bm::set_block_mask);
    __m256i sw_mask = _mm256_set1_epi32(bm::set_word_mask);
    __m256i maskFF = _mm256_set1_epi32(~0u);

    __m256i mask_tmp, mask_0;

    alignas(32) unsigned mword_v[8];

    unsigned k = 0, mask, w_idx;
    for (; k < len_unr; k += unroll_factor)
    {
        __m256i nbitA, nwordA;
        const unsigned base = start + k;
        __m256i* idx_ptr = (__m256i*)(idx+base);

        nbitA = _mm256_and_si256 (_mm256_loadu_si256(idx_ptr), sb_mask);
        nwordA = _mm256_srli_epi32 (nbitA, bm::set_word_shift);

        // are all 8 gather addresses (word indexes) the same?
        mask_tmp = _mm256_shuffle_epi32 (nwordA, _MM_SHUFFLE(1,1,1,1));
        mask_tmp = _mm256_permute2x128_si256 (mask_tmp, mask_tmp, 0);
        mask = _mm256_movemask_epi8(_mm256_cmpeq_epi32(mask_tmp, nwordA));
        _mm256_store_si256((__m256i*)mword_v, nwordA);

        if (mask == ~0u) // all addresses are the same: broadcast one word
        {
            w_idx = mword_v[0];
            mask_tmp = _mm256_set1_epi32(blk[w_idx]);
        }
        else // gather the 8 words blk[nword]
        {
            mask_tmp = _mm256_set_epi32(blk[mword_v[7]], blk[mword_v[6]],
                                        blk[mword_v[5]], blk[mword_v[4]],
                                        blk[mword_v[3]], blk[mword_v[2]],
                                        blk[mword_v[1]], blk[mword_v[0]]);
        }

        // isolate the addressed bit in each gathered word
        __m256i shiftA = _mm256_and_si256 (nbitA, sw_mask);
        __m256i mask1 = _mm256_srli_epi32 (maskFF, 31);     // 1 in every lane
        mask_0 = _mm256_sllv_epi32(mask1, shiftA);

        mask_tmp = _mm256_and_si256(mask_tmp, mask_0);
        if (!_mm256_testz_si256(mask_tmp, mask_tmp)) // any bit set: scatter it
        {
            __m256i* target_ptr = (__m256i*)(arr+base);

            __m256i maskZ = _mm256_xor_si256(maskFF, maskFF); // all-zero
            mask1 = _mm256_slli_epi32(mask1, bit_idx);        // destination bit
            mask_tmp = _mm256_cmpeq_epi32 (mask_tmp, maskZ);
            mask_tmp = _mm256_xor_si256 (mask_tmp, maskFF);   // 0xFF.. where bit was set
            mask_tmp = _mm256_and_si256 (mask_tmp, mask1);
            _mm256_storeu_si256 (target_ptr,
                                 _mm256_or_si256 (mask_tmp,
                                                  _mm256_loadu_si256(target_ptr)));
        }
    } // for k

    for (; k < len; ++k) // scalar tail
    {
        const unsigned base = start + k;
        unsigned nbit = unsigned(idx[base] & bm::set_block_mask);
        arr[base] |= (unsigned(bool(blk[nbit >> bm::set_word_shift] &
                                    (1u << (nbit & bm::set_word_mask)))) << bit_idx);
    } // for k
}
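A scalar sketch of the same gather-scatter step, equivalent to the tail loop above (hypothetical helper name, for illustration only): for each index, test one bit in the source bit-block and OR it into bit bit_idx of the output element.

inline void scalar_gather_scatter(unsigned* arr, const unsigned* blk,
                                  const unsigned* idx, unsigned size,
                                  unsigned start, unsigned bit_idx)
{
    for (unsigned i = start; i < size; ++i)
    {
        unsigned nbit = idx[i] & bm::set_block_mask;
        unsigned bit  = (blk[nbit >> bm::set_word_shift] >> (nbit & bm::set_word_mask)) & 1u;
        arr[i] |= (bit << bit_idx);
    }
}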

inline
unsigned avx2_bit_to_gap(gap_word_t* dest, const unsigned* block, unsigned dest_len)
{
    // ... (setup: output pointer, block_end, dest_len guard, current 64-bit word
    //      val and its word index k are established here)
    gap_word_t* pcurr = dest;
    unsigned bitval = (*block) & 1u;
    unsigned bit_idx = 0;

    const unsigned vCAP = 64;                 // bits consumed per 64-bit word
    __m256i maskZ = _mm256_set1_epi32(0);

    for (; block < block_end; block += 8)     // one 256-bit wave per iteration
    {
        // find the first non-zero 64-bit word of the wave
        __m256i accA = _mm256_load_si256((__m256i*)block);
        __m256i cmpA = _mm256_cmpeq_epi8(accA, maskZ);
        unsigned mask = ~_mm256_movemask_epi8(cmpA);
        // ...
        unsigned w64_idx = _tzcnt_u32(mask);
        // ...
        bit_idx += k * vCAP;

        // an all-0 or all-1 word is consumed in one branchless step:
        // the output pointer advances only when the run value flips
        if (!val || val == ~0ull)
        {
            // ... (cmp is computed from the word value vs the current run value)
            bitval ^= unsigned(cmp);
            unsigned long long pcu = reinterpret_cast<unsigned long long>(pcurr);
            // ... (conditional pointer advance, no branch)
            pcurr = reinterpret_cast<gap_word_t*>(pcu);
            continue;
        }

        // mixed word: peel runs off with trailing-zero counts
        unsigned bits_consumed = 0;
        unsigned tz = 1u;
        if (bitval != (val & tz))
        {
            // ... (close the previous run)
            BM_ASSERT((pcurr-1) == (dest+1) || *(pcurr-1) > *(pcurr-2));
        }
        // ... (per-run loop over the word)
        tz = (unsigned)_tzcnt_u64(bitval ? ~val : val);   // length of the current run
        bool cmp = ((bits_consumed += tz) < vCAP);
        // ...
        bitval ^= unsigned(cmp);
        bit_idx += tz & (vCAP - bits_consumed);
        unsigned long long pcu = reinterpret_cast<unsigned long long>(pcurr);
        // ... (branchless pointer advance, as above)
        pcurr = reinterpret_cast<gap_word_t*>(pcu);
        BM_ASSERT((pcurr-1) == (dest+1) || *(pcurr-1) > *(pcurr-2));
    } // for block

    unsigned len = (unsigned)(pcurr - dest);
    // ... (write the GAP length into the header and return)
    return len;
}
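The underlying idea is easier to see in scalar form: walk the block bit by bit and record the position where the run value flips. A sketch of that GAP (run-length) encoding, not the vectorized routine above (hypothetical helper name):

#include <vector>

typedef unsigned short gap_word_t;

// Scalar sketch: encode a bit-block of nbits bits as a GAP list.
// out[0] is a header whose bit 0 is the value of the first run;
// each following element is the inclusive end position of a run.
inline unsigned scalar_bit_to_gap(std::vector<gap_word_t>& out,
                                  const unsigned* block, unsigned nbits)
{
    unsigned bitval = block[0] & 1u;
    out.clear();
    out.push_back(gap_word_t(bitval));        // header: value of the first run
    for (unsigned i = 1; i < nbits; ++i)
    {
        unsigned b = (block[i >> 5] >> (i & 31)) & 1u;
        if (b != bitval)                      // run flips: close the previous run
        {
            out.push_back(gap_word_t(i - 1));
            bitval = b;
        }
    }
    out.push_back(gap_word_t(nbits - 1));     // terminating run end
    return (unsigned)out.size() - 1;          // number of run boundaries written
}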

inline
void avx2_bit_block_xor(bm::word_t* target_block,
                        const bm::word_t* block,
                        const bm::word_t* xor_block,
                        bm::id64_t digest)
{
    for (unsigned i = 0; i < bm::block_waves; ++i)
    {
        unsigned off = (i * bm::set_block_digest_wave_size);

        const __m256i* sub_block = (__m256i*) (block + off);
        __m256i* t_sub_block = (__m256i*)(target_block + off);

        if (digest & (1ull << i)) // digest-selected stride: XOR it in
        {
            const __m256i* xor_sub_block = (__m256i*) (xor_block + off);
            __m256i mA, mB, mC, mD;
            mA = _mm256_xor_si256(_mm256_load_si256(sub_block),
                                  _mm256_load_si256(xor_sub_block));
            mB = _mm256_xor_si256(_mm256_load_si256(sub_block+1),
                                  _mm256_load_si256(xor_sub_block+1));
            mC = _mm256_xor_si256(_mm256_load_si256(sub_block+2),
                                  _mm256_load_si256(xor_sub_block+2));
            mD = _mm256_xor_si256(_mm256_load_si256(sub_block+3),
                                  _mm256_load_si256(xor_sub_block+3));

            _mm256_store_si256(t_sub_block, mA);
            _mm256_store_si256(t_sub_block+1, mB);
            _mm256_store_si256(t_sub_block+2, mC);
            _mm256_store_si256(t_sub_block+3, mD);
        }
        else // stride not in the digest: plain copy
        {
            _mm256_store_si256(t_sub_block,   _mm256_load_si256(sub_block));
            _mm256_store_si256(t_sub_block+1, _mm256_load_si256(sub_block+1));
            _mm256_store_si256(t_sub_block+2, _mm256_load_si256(sub_block+2));
            _mm256_store_si256(t_sub_block+3, _mm256_load_si256(sub_block+3));
        }
    } // for i
}
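The digest is a 64-bit summary with one bit per stride of the block; only strides whose digest bit is set get the XOR treatment, the rest are copied through. A scalar sketch of that dispatch (hypothetical helper; waves and wave_words stand for bm::block_waves and bm::set_block_digest_wave_size from the index below):

inline void scalar_block_xor_by_digest(unsigned* target, const unsigned* src,
                                       const unsigned* xor_src,
                                       unsigned long long digest,
                                       unsigned waves, unsigned wave_words)
{
    for (unsigned i = 0; i < waves; ++i)
    {
        unsigned off = i * wave_words;
        if (digest & (1ull << i))
            for (unsigned j = 0; j < wave_words; ++j)
                target[off + j] = src[off + j] ^ xor_src[off + j];
        else
            for (unsigned j = 0; j < wave_words; ++j)
                target[off + j] = src[off + j];
    }
}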

inline
void avx2_bit_block_xor_2way(bm::word_t* target_block,
                             const bm::word_t* xor_block,
                             bm::id64_t digest) noexcept
{
    for (unsigned i = 0; i < bm::block_waves; ++i)
    {
        if (!(digest & (1ull << i))) // only digest-selected strides are touched
            continue;
        unsigned off = (i * bm::set_block_digest_wave_size);

        const __m256i* sub_block = (const __m256i*) (xor_block + off);
        __m256i* t_sub_block = (__m256i*)(target_block + off);

        __m256i mA, mB, mC, mD;
        mA = _mm256_xor_si256(_mm256_load_si256(sub_block),
                              _mm256_load_si256(t_sub_block));
        mB = _mm256_xor_si256(_mm256_load_si256(sub_block+1),
                              _mm256_load_si256(t_sub_block+1));
        mC = _mm256_xor_si256(_mm256_load_si256(sub_block+2),
                              _mm256_load_si256(t_sub_block+2));
        mD = _mm256_xor_si256(_mm256_load_si256(sub_block+3),
                              _mm256_load_si256(t_sub_block+3));

        _mm256_store_si256(t_sub_block, mA);
        _mm256_store_si256(t_sub_block+1, mB);
        _mm256_store_si256(t_sub_block+2, mC);
        _mm256_store_si256(t_sub_block+3, mD);
    } // for i
}

#pragma GCC diagnostic pop

#define VECT_XOR_ARR_2_MASK(dst, src, src_end, mask)\
    avx2_xor_arr_2_mask((__m256i*)(dst), (__m256i*)(src), (__m256i*)(src_end), (bm::word_t)mask)

#define VECT_ANDNOT_ARR_2_MASK(dst, src, src_end, mask)\
    avx2_andnot_arr_2_mask((__m256i*)(dst), (__m256i*)(src), (__m256i*)(src_end), (bm::word_t)mask)

#define VECT_BITCOUNT(first, last) \
    avx2_bit_count((__m256i*) (first), (__m256i*) (last))

#define VECT_BITCOUNT_AND(first, last, mask) \
    avx2_bit_count_and((__m256i*) (first), (__m256i*) (last), (__m256i*) (mask))

#define VECT_BITCOUNT_OR(first, last, mask) \
    avx2_bit_count_or((__m256i*) (first), (__m256i*) (last), (__m256i*) (mask))

#define VECT_BITCOUNT_XOR(first, last, mask) \
    avx2_bit_count_xor((__m256i*) (first), (__m256i*) (last), (__m256i*) (mask))

#define VECT_BITCOUNT_SUB(first, last, mask) \
    avx2_bit_count_sub((__m256i*) (first), (__m256i*) (last), (__m256i*) (mask))

#define VECT_INVERT_BLOCK(first) \
    avx2_invert_block((__m256i*)first);

#define VECT_AND_BLOCK(dst, src) \
    avx2_and_block((__m256i*) dst, (const __m256i*) (src))

#define VECT_AND_DIGEST(dst, src) \
    avx2_and_digest((__m256i*) dst, (const __m256i*) (src))

#define VECT_AND_DIGEST_2WAY(dst, src1, src2) \
    avx2_and_digest_2way((__m256i*) dst, (const __m256i*) (src1), (const __m256i*) (src2))

#define VECT_AND_OR_DIGEST_2WAY(dst, src1, src2) \
    avx2_and_or_digest_2way((__m256i*) dst, (const __m256i*) (src1), (const __m256i*) (src2))

#define VECT_AND_DIGEST_5WAY(dst, src1, src2, src3, src4) \
    avx2_and_digest_5way((__m256i*) dst, (const __m256i*) (src1), (const __m256i*) (src2), (const __m256i*) (src3), (const __m256i*) (src4))

#define VECT_AND_DIGEST_3WAY(dst, src1, src2) \
    avx2_and_digest_3way((__m256i*) dst, (const __m256i*) (src1), (const __m256i*) (src2))

#define VECT_OR_BLOCK(dst, src) \
    avx2_or_block((__m256i*) dst, (__m256i*) (src))

#define VECT_OR_BLOCK_3WAY(dst, src1, src2) \
    avx2_or_block_3way((__m256i*) dst, (__m256i*) (src1), (__m256i*) (src2))

#define VECT_OR_BLOCK_2WAY(dst, src1, src2) \
    avx2_or_block_2way((__m256i*) dst, (__m256i*) (src1), (__m256i*) (src2))

#define VECT_OR_BLOCK_5WAY(dst, src1, src2, src3, src4) \
    avx2_or_block_5way((__m256i*) dst, (__m256i*) (src1), (__m256i*) (src2), (__m256i*) (src3), (__m256i*) (src4))

#define VECT_SUB_BLOCK(dst, src) \
    avx2_sub_block((__m256i*) dst, (__m256i*) (src))

#define VECT_SUB_DIGEST(dst, src) \
    avx2_sub_digest((__m256i*) dst, (const __m256i*) (src))

#define VECT_SUB_DIGEST_2WAY(dst, src1, src2) \
    avx2_sub_digest_2way((__m256i*) dst, (const __m256i*) (src1), (const __m256i*) (src2))

#define VECT_SUB_DIGEST_5WAY(dst, src1, src2, src3, src4) \
    avx2_sub_digest_5way((__m256i*) dst, (const __m256i*) (src1), (const __m256i*) (src2), (const __m256i*) (src3), (const __m256i*) (src4))

#define VECT_SUB_DIGEST_3WAY(dst, src1, src2) \
    avx2_sub_digest_3way((__m256i*) dst, (const __m256i*) (src1), (const __m256i*) (src2))

#define VECT_XOR_BLOCK(dst, src) \
    avx2_xor_block((__m256i*) dst, (__m256i*) (src))

#define VECT_XOR_BLOCK_2WAY(dst, src1, src2) \
    avx2_xor_block_2way((__m256i*) dst, (__m256i*) (src1), (__m256i*) (src2))

#define VECT_COPY_BLOCK(dst, src) \
    avx2_copy_block((__m256i*) dst, (__m256i*) (src))

#define VECT_COPY_BLOCK_UNALIGN(dst, src) \
    avx2_copy_block_unalign((__m256i*) dst, (__m256i*) (src))

#define VECT_STREAM_BLOCK(dst, src) \
    avx2_stream_block((__m256i*) dst, (__m256i*) (src))

#define VECT_STREAM_BLOCK_UNALIGN(dst, src) \
    avx2_stream_block_unalign((__m256i*) dst, (__m256i*) (src))

#define VECT_SET_BLOCK(dst, value) \
    avx2_set_block((__m256i*) dst, (value))

#define VECT_IS_ZERO_BLOCK(dst) \
    avx2_is_all_zero((__m256i*) dst)

#define VECT_IS_ONE_BLOCK(dst) \
    avx2_is_all_one((__m256i*) dst)

#define VECT_IS_DIGEST_ZERO(start) \
    avx2_is_digest_zero((__m256i*)start)

#define VECT_BLOCK_SET_DIGEST(dst, val) \
    avx2_block_set_digest((__m256i*)dst, val)

#define VECT_LOWER_BOUND_SCAN_U32(arr, target, from, to) \
    avx2_lower_bound_scan_u32(arr, target, from, to)

#define VECT_SHIFT_L1(b, acc, co) \
    avx2_shift_l1((__m256i*)b, acc, co)

#define VECT_SHIFT_R1(b, acc, co) \
    avx2_shift_r1((__m256i*)b, acc, co)

#define VECT_SHIFT_R1_AND(b, co, m, digest) \
    avx2_shift_r1_and((__m256i*)b, co, (__m256i*)m, digest)

#define VECT_ARR_BLOCK_LOOKUP(idx, size, nb, start) \
    avx2_idx_arr_block_lookup(idx, size, nb, start)

#define VECT_SET_BLOCK_BITS(block, idx, start, stop) \
    avx2_set_block_bits3(block, idx, start, stop)

#define VECT_BLOCK_CHANGE(block, size) \
    avx2_bit_block_calc_change((__m256i*)block, size)

#define VECT_BLOCK_XOR_CHANGE(block, xor_block, size, gc, bc) \
    avx2_bit_block_calc_xor_change((__m256i*)block, (__m256i*)xor_block, size, gc, bc)

#define VECT_BLOCK_CHANGE_BC(block, gc, bc) \
    avx2_bit_block_calc_change_bc((__m256i*)block, gc, bc)

#define VECT_BIT_TO_GAP(dest, src, dest_len) \
    avx2_bit_to_gap(dest, src, dest_len)

#define VECT_BIT_FIND_FIRST(src1, off, pos) \
    avx2_bit_find_first((__m256i*) src1, off, pos)

#define VECT_BIT_FIND_DIFF(src1, src2, pos) \
    avx2_bit_find_first_diff((__m256i*) src1, (__m256i*) (src2), pos)

#define VECT_BIT_BLOCK_XOR(t, src, src_xor, d) \
    avx2_bit_block_xor(t, src, src_xor, d)

#define VECT_BIT_BLOCK_XOR_2WAY(t, src_xor, d) \
    avx2_bit_block_xor_2way(t, src_xor, d)

#define VECT_GAP_BFIND(buf, pos, is_set) \
    avx2_gap_bfind(buf, pos, is_set)

#define VECT_GAP_TEST(buf, pos) \
    avx2_gap_test(buf, pos)

#define VECT_BIT_COUNT_DIGEST(blk, d) \
    avx2_bit_block_count(blk, d)
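These VECT_* macros form the dispatch layer: portable kernels elsewhere in the library call the macro names, and this header maps them onto the AVX2 routines above. A small illustrative sketch, assuming this header is included in an AVX2-enabled build and that blk points to a 32-byte-aligned bit-block of bm::set_block_size (2048) words; the helper name count_block is hypothetical:

// Count the set bits of one bit-block through the same entry point
// the portable code uses once the AVX2 macros are in effect.
static unsigned count_block(const bm::word_t* blk)
{
    const bm::word_t* blk_end = blk + bm::set_block_size;
    return (unsigned) VECT_BITCOUNT(blk, blk_end);
}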

#define BM_AVX2_POPCNT_PROLOG

#define BM_CSA256(h, l, a, b, c)

#define BM_AVX2_BIT_COUNT(ret, v)

Bit manipulation primitives (internal)

void avx2_copy_block(__m256i *dst, const __m256i *src)

AVX2 block copy dst = *src.

unsigned avx2_and_block(__m256i *dst, const __m256i *src)

AND array elements against another array dst &= *src.

void avx2_xor_arr_2_mask(__m256i *dst, const __m256i *src, const __m256i *src_end, bm::word_t mask)

XOR array elements to specified mask dst = *src ^ mask.

bool avx2_test_all_zero_wave2(const void *ptr0, const void *ptr1)

check if 2 waves of pointers are all NULL

void avx2_bit_block_calc_change_bc(const __m256i *block, unsigned *gcount, unsigned *bcount)

unsigned avx2_gap_test(const unsigned short *buf, unsigned pos)

Hybrid binary search, starts as binary, then switches to scan.

bool avx2_is_all_one(const __m256i *block)

check if block is all one bits

bool avx2_and_or_digest_2way(__m256i *dst, const __m256i *src1, const __m256i *src2)

AND-OR block digest stride 2 way dst |= *src1 & *src2.

bool avx2_or_arr_unal(__m256i *dst, const __m256i *src, const __m256i *src_end)

OR array elements against another unaligned array dst |= *src.

bool avx2_or_block_3way(__m256i *dst, const __m256i *src1, const __m256i *src2)

OR array elements against 2 other arrays: dst |= *src1 | *src2.

bool avx2_test_all_eq_wave2(const void *ptr0, const void *ptr1)

check if 2 waves of pointers are all the same (NULL or FULL)

unsigned avx2_and_arr_unal(__m256i *dst, const __m256i *src, const __m256i *src_end)

AND array elements against another array (unaligned) dst &= *src.

bool avx2_and_digest_2way(__m256i *dst, const __m256i *src1, const __m256i *src2)

AND block digest stride 2 way dst = *src1 & *src2.

bool avx2_or_block(__m256i *dst, const __m256i *src)

OR array elements against another array dst |= *src.

unsigned avx2_xor_block(__m256i *dst, const __m256i *src)

XOR block against another dst ^= *src.

bool avx2_sub_digest(__m256i *dst, const __m256i *src)

SUB (AND NOT) block digest stride dst &= ~*src.

bm::id_t avx2_bit_count_sub(const __m256i *block, const __m256i *block_end, const __m256i *mask_block)

AND NOT bit count for two aligned bit-blocks.

unsigned avx2_gap_bfind(const unsigned short *buf, unsigned pos, unsigned *is_set)

Hybrid binary search, starts as binary, then switches to scan.

unsigned avx2_bit_to_gap(gap_word_t *dest, const unsigned *block, unsigned dest_len)

Convert bit block to GAP block.

bool avx2_sub_digest_5way(__m256i *dst, const __m256i *src1, const __m256i *src2, const __m256i *src3, const __m256i *src4)

SUB block digest stride.

bool avx2_shift_r1(__m256i *block, bm::word_t *empty_acc, unsigned co1)

block shift right by 1

bool avx2_test_all_zero_wave(const void *ptr)

check if wave of pointers is all NULL

unsigned avx2_bit_block_calc_change(const __m256i *block, unsigned size)

bool avx2_sub_digest_2way(__m256i *dst, const __m256i *src1, const __m256i *src2)

2-operand SUB (AND NOT) block digest stride dst = *src1 & ~*src2

bool avx2_sub_digest_3way(__m256i *dst, const __m256i *src1, const __m256i *src2)

SUB block digest stride.

void avx2_andnot_arr_2_mask(__m256i *dst, const __m256i *src, const __m256i *src_end, bm::word_t mask)

Inverts array elements and NOT them to specified mask dst = ~*src & mask.

void avx2_block_set_digest(__m256i *dst, unsigned value)

set digest stride to 0xFF.. or 0x0 value

bm::id_t avx2_bit_count_xor(const __m256i *block, const __m256i *block_end, const __m256i *mask_block)

XOR bit count for two aligned bit-blocks.

bm::id_t avx2_bit_count(const __m256i *block, const __m256i *block_end)

AVX2 Harley-Seal popcount. The algorithm is based on the paper "Faster Population Counts using AVX2 In...

bm::id_t avx2_bit_count_and(const __m256i *block, const __m256i *block_end, const __m256i *mask_block)

AND bit count for two aligned bit-blocks.

void avx2_stream_block(__m256i *dst, const __m256i *src)

AVX2 block copy dst = *src.

void avx2_copy_block_unalign(__m256i *dst, const __m256i *src)

AVX2 block copy (unaligned SRC) dst = *src.

void avx2_invert_block(__m256i *dst)

Invert bit-block: dst = ~*dst (i.e. dst ^= all-ones).

void avx2_bit_block_calc_xor_change(const __m256i *block, const __m256i *xor_block, unsigned size, unsigned *gcount, unsigned *bcount)

bool avx2_shift_r1_and(__m256i *block, bm::word_t co1, const __m256i *mask_block, bm::id64_t *digest)

fused block shift right by 1 plus AND

bool avx2_bit_find_first_diff(const __m256i *block1, const __m256i *block2, unsigned *pos)

Find first bit which is different between two bit-blocks.

bm::id_t avx2_bit_block_count(const bm::word_t *const block, bm::id64_t digest)

Calculate population count based on digest.

bool avx2_and_digest_3way(__m256i *dst, const __m256i *src1, const __m256i *src2)

AND block digest stride.

bool avx2_and_digest(__m256i *dst, const __m256i *src)

AND block digest stride dst &= *src.

unsigned avx2_sub_block(__m256i *dst, const __m256i *src)

AND-NOT (SUB) array elements against another array dst &= ~*src.

void avx2_bit_block_xor(bm::word_t *target_block, const bm::word_t *block, const bm::word_t *xor_block, bm::id64_t digest)

Build partial XOR product of 2 bit-blocks using digest mask.

void avx2_set_block(__m256i *dst, bm::word_t value)

AVX2 block memset dst = value.

bool avx2_test_all_one_wave(const void *ptr)

check if wave of pointers is all 0xFF.. (FULL)

void avx2_bit_block_xor_2way(bm::word_t *target_block, const bm::word_t *xor_block, bm::id64_t digest) noexcept

Build partial XOR product of 2 bit-blocks using digest mask.

bool avx2_or_block_2way(__m256i *dst, const __m256i *src1, const __m256i *src2)

OR 2 arrays and copy to the destination: dst = *src1 | *src2.

unsigned avx2_xor_block_2way(__m256i *dst, const __m256i *src1, const __m256i *src2)

3-operand XOR: dst = *src1 ^ *src2

bool avx2_is_digest_zero(const __m256i *block)

check if digest stride is all zero bits

bool avx2_bit_find_first(const __m256i *block, unsigned off, unsigned *pos)

Find first bit set.

void avx2_stream_block_unalign(__m256i *dst, const __m256i *src)

AVX2 block copy (unaligned SRC) dst = *src.

bool avx2_shift_l1(__m256i *block, bm::word_t *empty_acc, unsigned co1)

block shift left by 1

int avx2_cmpge_u16(__m256i vect16, unsigned short value)

Experimental (test) function to do SIMD vector search in sorted, growing array.

int avx2_cmpge_u32(__m256i vect8, unsigned value)

Experimental (test) function to do SIMD vector search (lower bound) in sorted, growing array.

bool avx2_or_block_5way(__m256i *dst, const __m256i *src1, const __m256i *src2, const __m256i *src3, const __m256i *src4)

OR array elements against 4 other arrays: dst |= *src1 | *src2 | *src3 | *src4.

bool avx2_is_all_zero(const __m256i *block)

check if block is all zero bits

bool avx2_and_digest_5way(__m256i *dst, const __m256i *src1, const __m256i *src2, const __m256i *src3, const __m256i *src4)

AND block digest stride.

unsigned avx2_lower_bound_scan_u32(const unsigned *arr, unsigned target, unsigned from, unsigned to)

lower bound (greater or equal) linear scan in an ascending-order sorted array

const unsigned set_block_digest_wave_size

unsigned avx2_idx_arr_block_lookup(const unsigned *idx, unsigned size, unsigned nb, unsigned start)

__m256i avx2_setbit_to256(unsigned i)

Experimental.

const unsigned set_block_mask

const bm::gap_word_t * avx2_gap_sum_arr(const bm::gap_word_t *pbuf, unsigned avx_vect_waves, unsigned *sum)

bm::id_t avx2_bit_count_or(const __m256i *block, const __m256i *block_end, const __m256i *mask_block)

void avx2_set_block_bits2(bm::word_t *block, const unsigned *idx, unsigned start, unsigned stop)

Experimental code to set bits via AVX strides.

unsigned long long bmi_bslr_u64(unsigned long long w) noexcept

void avx2_set_block_bits(bm::word_t *block, const unsigned *idx, unsigned start, unsigned stop)

void avx2_set_block_bits3(bm::word_t *block, const unsigned *idx, unsigned start, unsigned stop)

Experimental code to set bits via AVX strides.

const unsigned set_word_shift

const unsigned set_block_size

unsigned long long int id64_t

const unsigned block_waves

__m256i avx2_setbit_256(__m256i target, __m256i source)

Set bits in an AVX target, by indexes (int4) from the source.

unsigned short gap_word_t

void avx2_bit_block_gather_scatter(unsigned *arr, const unsigned *blk, const unsigned *idx, unsigned size, unsigned start, unsigned bit_idx)

const unsigned gap_max_bits

const unsigned set_block_shift

const unsigned set_word_mask

unsigned long long bmi_blsi_u64(unsigned long long w)


#define _MM_SHUFFLE(fp3, fp2, fp1, fp0)

MACRO for shuffle parameter for _mm_shuffle_ps().

