NCBI C++ ToolKit: include/util/bitset/bmavx512.h Source File

#ifndef BMAVX512__H__INCLUDED__
#define BMAVX512__H__INCLUDED__

#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wconversion"

// bool avx512_test_zero(__m512i m) -- true when every bit of m is zero
const __mmask16 m16F = __mmask16(~0u);
__mmask16 eq_m = _mm512_cmpeq_epi32_mask(m, _mm512_set1_epi64(0ull));
return (eq_m == m16F);

// bool avx512_test_one(__m512i m) -- true when every bit of m is set
const __mmask16 m16F = __mmask16(~0u);
__mmask16 eq_m = _mm512_cmpeq_epi32_mask(m, _mm512_set1_epi64(-1));
return (eq_m == m16F);

// carry-save adder: h collects the carries, l the partial sum
#define BM_CSA256(h, l, a, b, c) \
{ \
    __m256i u = _mm256_xor_si256(a, b); \
    h = _mm256_or_si256(_mm256_and_si256(a, b), _mm256_and_si256(u, c)); \
    l = _mm256_xor_si256(u, c); \
}

// nibble popcount: two PSHUFB table lookups folded with a byte-wise SAD
#define BM_AVX2_BIT_COUNT(ret, v) \
{ \
    __m256i lo = _mm256_and_si256(v, low_mask); \
    __m256i hi = _mm256_and_si256(_mm256_srli_epi16(v, 4), low_mask); \
    __m256i cnt1 = _mm256_shuffle_epi8(lookup1, lo); \
    __m256i cnt2 = _mm256_shuffle_epi8(lookup2, hi); \
    ret = _mm256_sad_epu8(cnt1, cnt2); \
}

#define BM_AVX2_DECL_LOOKUP1 \
  __m256i lookup1 = _mm256_setr_epi8(4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8, \
                                     4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8);
#define BM_AVX2_DECL_LOOKUP2 \
  __m256i lookup2 = _mm256_setr_epi8(4, 3, 3, 2, 3, 2, 2, 1, 3, 2, 2, 1, 2, 1, 1, 0, \
                                     4, 3, 3, 2, 3, 2, 2, 1, 3, 2, 2, 1, 2, 1, 1, 0);

#define BM_AVX2_POPCNT_PROLOG \
  BM_AVX2_DECL_LOOKUP1 \
  BM_AVX2_DECL_LOOKUP2 \
  __m256i low_mask = _mm256_set1_epi8(0x0f);

// bm::id_t avx2_bit_count(const __m256i* block, const __m256i* block_end)
// AVX2 Harley-Seal popcount: ones/twos/fours/eights/sixteens accumulator tree
__m256i cnt = _mm256_setzero_si256();
__m256i ones = _mm256_setzero_si256();
__m256i twos = _mm256_setzero_si256();
__m256i fours = _mm256_setzero_si256();
__m256i eights = _mm256_setzero_si256();
__m256i sixteens = _mm256_setzero_si256();
__m256i twosA, twosB, foursA, foursB, eightsA, eightsB;

b = _mm256_load_si256(block+0); c = _mm256_load_si256(block+1);
b = _mm256_load_si256(block+2); c = _mm256_load_si256(block+3);
BM_CSA256(foursA, twos, twos, twosA, twosB);

b = _mm256_load_si256(block+4); c = _mm256_load_si256(block+5);
b = _mm256_load_si256(block+6); c = _mm256_load_si256(block+7);
BM_CSA256(foursB, twos, twos, twosA, twosB);
BM_CSA256(eightsA, fours, fours, foursA, foursB);

b = _mm256_load_si256(block+8); c = _mm256_load_si256(block+9);
b = _mm256_load_si256(block+10); c = _mm256_load_si256(block+11);
BM_CSA256(foursA, twos, twos, twosA, twosB);

b = _mm256_load_si256(block+12); c = _mm256_load_si256(block+13);
b = _mm256_load_si256(block+14); c = _mm256_load_si256(block+15);
BM_CSA256(foursB, twos, twos, twosA, twosB);
BM_CSA256(eightsB, fours, fours, foursA, foursB);
BM_CSA256(sixteens, eights, eights, eightsA, eightsB);

cnt = _mm256_add_epi64(cnt, bc);
} while (block < block_end);

cnt = _mm256_slli_epi64(cnt, 4);
cnt = _mm256_add_epi64(cnt, _mm256_slli_epi64(bc, 3));
cnt = _mm256_add_epi64(cnt, _mm256_slli_epi64(bc, 2));
cnt = _mm256_add_epi64(cnt, _mm256_slli_epi64(bc, 1));
cnt = _mm256_add_epi64(cnt, bc);
return (unsigned)(cnt64[0] + cnt64[1] + cnt64[2] + cnt64[3]);
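For reference, the BM_CSA256 carry-save step above obeys a simple popcount identity that the Harley-Seal loop exploits. The following is a minimal scalar sketch, not part of bmavx512.h; csa64 and csa64_check are illustrative names, and std::popcount from <bit> assumes C++20:

#include <bit>
#include <cstdint>

// Scalar analogue of BM_CSA256: per bit position, l is the low bit of a+b+c
// and h is the carry, so popcount(a)+popcount(b)+popcount(c) == 2*popcount(h)+popcount(l).
inline void csa64(uint64_t& h, uint64_t& l, uint64_t a, uint64_t b, uint64_t c)
{
    uint64_t u = a ^ b;
    h = (a & b) | (u & c);
    l = u ^ c;
}

// quick self-check of the identity the popcount loop relies on
inline bool csa64_check(uint64_t a, uint64_t b, uint64_t c)
{
    uint64_t h, l;
    csa64(h, l, a, b, c);
    return std::popcount(a) + std::popcount(b) + std::popcount(c)
           == 2 * std::popcount(h) + std::popcount(l);
}

This identity is why the final reduction above shifts the sixteens count left by 4 and folds the lower-order accumulators in with progressively smaller shifts.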

// bm::id_t avx2_bit_count_and(const __m256i* block, const __m256i* block_end, const __m256i* mask_block)
// AND bit count for two aligned bit-blocks
__m256i cnt = _mm256_setzero_si256();

ymm0 = _mm256_load_si256(block);
ymm1 = _mm256_load_si256(mask_block);
ymm0 = _mm256_and_si256(ymm0, ymm1);
++block; ++mask_block;
cnt = _mm256_add_epi64(cnt, bc);

ymm0 = _mm256_load_si256(block);
ymm1 = _mm256_load_si256(mask_block);
ymm0 = _mm256_and_si256(ymm0, ymm1);
++block; ++mask_block;
cnt = _mm256_add_epi64(cnt, bc);

ymm0 = _mm256_load_si256(block);
ymm1 = _mm256_load_si256(mask_block);
ymm0 = _mm256_and_si256(ymm0, ymm1);
++block; ++mask_block;
cnt = _mm256_add_epi64(cnt, bc);

ymm0 = _mm256_load_si256(block);
ymm1 = _mm256_load_si256(mask_block);
ymm0 = _mm256_and_si256(ymm0, ymm1);
++block; ++mask_block;
cnt = _mm256_add_epi64(cnt, bc);
} while (block < block_end);

return (unsigned)(cnt64[0] + cnt64[1] + cnt64[2] + cnt64[3]);

// bm::id_t avx2_bit_count_or(const __m256i* block, const __m256i* block_end, const __m256i* mask_block)
__m256i cnt = _mm256_setzero_si256();

__m256i tmp0 = _mm256_load_si256(block);
__m256i tmp1 = _mm256_load_si256(mask_block);
cnt = _mm256_add_epi64(cnt, bc);
++block; ++mask_block;
} while (block < block_end);

return (unsigned)(cnt64[0] + cnt64[1] + cnt64[2] + cnt64[3]);

// bm::id_t avx2_bit_count_xor(const __m256i* block, const __m256i* block_end, const __m256i* mask_block)
// XOR bit count for two aligned bit-blocks
__m256i cnt = _mm256_setzero_si256();
__m256i mA, mB, mC, mD;

mA = _mm256_xor_si256(_mm256_load_si256(block+0),
                      _mm256_load_si256(mask_block+0));
cnt = _mm256_add_epi64(cnt, bc);

mB = _mm256_xor_si256(_mm256_load_si256(block+1),
                      _mm256_load_si256(mask_block+1));
cnt = _mm256_add_epi64(cnt, bc);

mC = _mm256_xor_si256(_mm256_load_si256(block+2),
                      _mm256_load_si256(mask_block+2));
cnt = _mm256_add_epi64(cnt, bc);

mD = _mm256_xor_si256(_mm256_load_si256(block+3),
                      _mm256_load_si256(mask_block+3));
cnt = _mm256_add_epi64(cnt, bc);

block += 4; mask_block += 4;
} while (block < block_end);

return (unsigned)(cnt64[0] + cnt64[1] + cnt64[2] + cnt64[3]);

// bm::id_t avx2_bit_count_sub(const __m256i* block, const __m256i* block_end, const __m256i* mask_block)
// AND NOT (SUB) bit count for two aligned bit-blocks
__m256i cnt = _mm256_setzero_si256();

__m256i tmp0 = _mm256_load_si256(block);
__m256i tmp1 = _mm256_load_si256(mask_block);
cnt = _mm256_add_epi64(cnt, bc);
++block; ++mask_block;
} while (block < block_end);

return (unsigned)(cnt64[0] + cnt64[1] + cnt64[2] + cnt64[3]);
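Functionally, each of these counters is a population count over a word-wise combination of two bit-blocks. A plain scalar sketch of the intended results follows; the helper names are illustrative and not part of the header, and <bit> assumes C++20:

#include <bit>
#include <cstddef>
#include <cstdint>

// Scalar reference for the AND and SUB (AND NOT) counters above.
inline unsigned bit_count_and_scalar(const uint32_t* a, const uint32_t* b, std::size_t n)
{
    unsigned cnt = 0;
    for (std::size_t i = 0; i < n; ++i)
        cnt += (unsigned)std::popcount(a[i] & b[i]);
    return cnt;
}

inline unsigned bit_count_sub_scalar(const uint32_t* a, const uint32_t* b, std::size_t n)
{
    unsigned cnt = 0;
    for (std::size_t i = 0; i < n; ++i)
        cnt += (unsigned)std::popcount(a[i] & ~b[i]);   // "SUB" is AND NOT
    return cnt;
}

The OR and XOR variants differ only in the combining operator.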

// void avx512_xor_arr_2_mask(__m512i* dst, const __m512i* src, const __m512i* src_end, bm::word_t mask)
// XOR array elements to a specified mask: dst = *src ^ mask
__m512i yM = _mm512_set1_epi32(int(mask));

_mm512_store_si512(dst+0, _mm512_xor_si512(_mm512_load_si512(src+0), yM));
_mm512_store_si512(dst+1, _mm512_xor_si512(_mm512_load_si512(src+1), yM));
_mm512_store_si512(dst+2, _mm512_xor_si512(_mm512_load_si512(src+2), yM));
_mm512_store_si512(dst+3, _mm512_xor_si512(_mm512_load_si512(src+3), yM));
} while (src < src_end);

// void avx512_andnot_arr_2_mask(__m512i* dst, const __m512i* src, const __m512i* src_end, bm::word_t mask)
// invert the source elements and AND with the mask: dst = ~*src & mask
__m512i yM = _mm512_set1_epi32(int(mask));

_mm512_store_si512(dst+0, _mm512_andnot_si512(_mm512_load_si512(src+0), yM));
_mm512_store_si512(dst+1, _mm512_andnot_si512(_mm512_load_si512(src+1), yM));
_mm512_store_si512(dst+2, _mm512_andnot_si512(_mm512_load_si512(src+2), yM));
_mm512_store_si512(dst+3, _mm512_andnot_si512(_mm512_load_si512(src+3), yM));
} while (src < src_end);

// unsigned avx512_and_block(__m512i* dst, const __m512i* src)
// AND array elements against another array: dst &= *src
__m512i m1A, m1B, m1C, m1D;
__m512i accA, accB, accC, accD;
accA = accB = accC = accD = _mm512_setzero_si512();

m1A = _mm512_and_si512(_mm512_load_si512(src+0), _mm512_load_si512(dst+0));
m1B = _mm512_and_si512(_mm512_load_si512(src+1), _mm512_load_si512(dst+1));
m1C = _mm512_and_si512(_mm512_load_si512(src+2), _mm512_load_si512(dst+2));
m1D = _mm512_and_si512(_mm512_load_si512(src+3), _mm512_load_si512(dst+3));

_mm512_store_si512(dst+0, m1A);
_mm512_store_si512(dst+1, m1B);
_mm512_store_si512(dst+2, m1C);
_mm512_store_si512(dst+3, m1D);

accA = _mm512_or_si512(accA, m1A);   // accumulate result bits to detect an all-zero outcome
accB = _mm512_or_si512(accB, m1B);
accC = _mm512_or_si512(accC, m1C);
accD = _mm512_or_si512(accD, m1D);
} while (src < src_end);

accA = _mm512_or_si512(accA, accB);
accC = _mm512_or_si512(accC, accD);
accA = _mm512_or_si512(accA, accC);

// bool avx512_and_digest(__m512i* dst, const __m512i* src)
// AND one digest stride: dst &= *src
m1A = _mm512_and_si512(_mm512_load_si512(src+0), _mm512_load_si512(dst+0));
m1B = _mm512_and_si512(_mm512_load_si512(src+1), _mm512_load_si512(dst+1));

_mm512_store_si512(dst+0, m1A);
_mm512_store_si512(dst+1, m1B);

m1A = _mm512_or_si512(m1A, m1B);

// bool avx512_and_digest_2way(__m512i* dst, const __m512i* src1, const __m512i* src2)
// AND one digest stride, 2-way: dst = *src1 & *src2
m1A = _mm512_and_si512(_mm512_load_si512(src1+0), _mm512_load_si512(src2+0));
m1B = _mm512_and_si512(_mm512_load_si512(src1+1), _mm512_load_si512(src2+1));

_mm512_store_si512(dst+0, m1A);
_mm512_store_si512(dst+1, m1B);

m1A = _mm512_or_si512(m1A, m1B);
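The OR-accumulators (accA..accD above, m1A | m1B in the digest strides) exist so that a single test at the end can report whether anything survived the AND. A scalar stand-in for the idiom, with illustrative names not taken from the header:

#include <cstddef>
#include <cstdint>

// OR every result word into an accumulator while writing it back; one test
// at the end tells whether the whole block or stride went to zero.
inline bool and_block_any_bits(uint32_t* dst, const uint32_t* src, std::size_t n)
{
    uint32_t acc = 0;
    for (std::size_t i = 0; i < n; ++i)
    {
        dst[i] &= src[i];
        acc |= dst[i];
    }
    return acc != 0;   // false: the result became all zero
}

A caller can then, for example, drop an empty block or clear the corresponding digest bit without rescanning the data.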

// unsigned avx2_and_arr_unal(__m256i* dst, const __m256i* src, const __m256i* src_end)
// AND array elements against another, unaligned, array: dst &= *src
__m256i m1A, m2A, m1B, m2B, m1C, m2C, m1D, m2D;
__m256i accA, accB, accC, accD;

accA = _mm256_setzero_si256();
accB = _mm256_setzero_si256();
accC = _mm256_setzero_si256();
accD = _mm256_setzero_si256();

m1A = _mm256_loadu_si256(src+0);   // unaligned source load
m2A = _mm256_load_si256(dst+0);
m1A = _mm256_and_si256(m1A, m2A);
_mm256_store_si256(dst+0, m1A);
accA = _mm256_or_si256(accA, m1A);

m1B = _mm256_loadu_si256(src+1);
m2B = _mm256_load_si256(dst+1);
m1B = _mm256_and_si256(m1B, m2B);
_mm256_store_si256(dst+1, m1B);
accB = _mm256_or_si256(accB, m1B);

m1C = _mm256_loadu_si256(src+2);
m2C = _mm256_load_si256(dst+2);
m1C = _mm256_and_si256(m1C, m2C);
_mm256_store_si256(dst+2, m1C);
accC = _mm256_or_si256(accC, m1C);

m1D = _mm256_loadu_si256(src+3);
m2D = _mm256_load_si256(dst+3);
m1D = _mm256_and_si256(m1D, m2D);
_mm256_store_si256(dst+3, m1D);
accD = _mm256_or_si256(accD, m1D);
} while (src < src_end);

accA = _mm256_or_si256(accA, accB);
accC = _mm256_or_si256(accC, accD);
accA = _mm256_or_si256(accA, accC);

return !_mm256_testz_si256(accA, accA);

// bool avx512_or_block(__m512i* dst, const __m512i* src)
// OR array elements against another array: dst |= *src
__m512i m1A, m1B, m1C, m1D;
__m512i mAccF0, mAccF1;
mAccF0 = mAccF1 = _mm512_set1_epi32(~0u);   // all-ones accumulators: detect a fully set block

m1A = _mm512_or_si512(_mm512_load_si512(src), _mm512_load_si512(dst));
m1B = _mm512_or_si512(_mm512_load_si512(src+1), _mm512_load_si512(dst+1));
mAccF0 = _mm512_and_si512(mAccF0, m1A);
mAccF0 = _mm512_and_si512(mAccF0, m1B);

_mm512_stream_si512(dst, m1A);
_mm512_stream_si512(dst+1, m1B);

m1C = _mm512_or_si512(_mm512_load_si512(src2), _mm512_load_si512(dst2));
m1D = _mm512_or_si512(_mm512_load_si512(src2+1), _mm512_load_si512(dst2+1));
mAccF1 = _mm512_and_si512(mAccF1, m1C);
mAccF1 = _mm512_and_si512(mAccF1, m1D);

_mm512_stream_si512(dst2, m1C);
_mm512_stream_si512(dst2+1, m1D);

src2 += 2; dst2 += 2;
} while (src2 < src_end);

mAccF0 = _mm512_and_si512(mAccF0, mAccF1);

// bool avx2_or_arr_unal(__m256i* dst, const __m256i* src, const __m256i* src_end)
// OR array elements against another, unaligned, array: dst |= *src
__m256i m1A, m2A, m1B, m2B, m1C, m2C, m1D, m2D;
__m256i mAccF0 = _mm256_set1_epi32(~0u);
__m256i mAccF1 = _mm256_set1_epi32(~0u);

m1A = _mm256_loadu_si256(src+0);
m2A = _mm256_load_si256(dst+0);
m1A = _mm256_or_si256(m1A, m2A);
_mm256_store_si256(dst+0, m1A);

m1B = _mm256_loadu_si256(src+1);
m2B = _mm256_load_si256(dst+1);
m1B = _mm256_or_si256(m1B, m2B);
_mm256_store_si256(dst+1, m1B);

m1C = _mm256_loadu_si256(src+2);
m2C = _mm256_load_si256(dst+2);
m1C = _mm256_or_si256(m1C, m2C);
_mm256_store_si256(dst+2, m1C);

m1D = _mm256_loadu_si256(src+3);
m2D = _mm256_load_si256(dst+3);
m1D = _mm256_or_si256(m1D, m2D);
_mm256_store_si256(dst+3, m1D);

mAccF1 = _mm256_and_si256(mAccF1, m1C);
mAccF1 = _mm256_and_si256(mAccF1, m1D);
mAccF0 = _mm256_and_si256(mAccF0, m1A);
mAccF0 = _mm256_and_si256(mAccF0, m1B);
} while (src < src_end);

__m256i maskF = _mm256_set1_epi32(~0u);
mAccF0 = _mm256_and_si256(mAccF0, mAccF1);
__m256i wcmpA = _mm256_cmpeq_epi8(mAccF0, maskF);
unsigned maskA = unsigned(_mm256_movemask_epi8(wcmpA));
return (maskA == ~0u);   // true: destination block is now all ones
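The OR kernels use the complementary idiom: AND every result vector into an all-ones accumulator, so one comparison at the end reports whether the destination block became completely full. A scalar stand-in, with illustrative names not taken from the header:

#include <cstddef>
#include <cstdint>

inline bool or_block_all_one(uint32_t* dst, const uint32_t* src, std::size_t n)
{
    uint32_t acc = ~0u;
    for (std::size_t i = 0; i < n; ++i)
    {
        dst[i] |= src[i];
        acc &= dst[i];          // stays all-ones only if every word is all-ones
    }
    return acc == ~0u;          // true: block could be replaced by a "full block" marker
}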

// bool avx512_or_block_2way(__m512i* dst, const __m512i* src1, const __m512i* src2)
// OR two blocks into the destination: dst = *src1 | *src2
__m512i m1A, m1B, m1C, m1D;
__m512i mAccF0, mAccF1;
mAccF0 = mAccF1 = _mm512_set1_epi32(~0u);

m1A = _mm512_or_si512(_mm512_load_si512(src1+0), _mm512_load_si512(src2+0));
m1B = _mm512_or_si512(_mm512_load_si512(src1+1), _mm512_load_si512(src2+1));
m1C = _mm512_or_si512(_mm512_load_si512(src1+2), _mm512_load_si512(src2+2));
m1D = _mm512_or_si512(_mm512_load_si512(src1+3), _mm512_load_si512(src2+3));

_mm512_store_si512(dst+0, m1A);
_mm512_store_si512(dst+1, m1B);
_mm512_store_si512(dst+2, m1C);
_mm512_store_si512(dst+3, m1D);

mAccF1 = _mm512_and_si512(mAccF1, m1C);
mAccF1 = _mm512_and_si512(mAccF1, m1D);
mAccF0 = _mm512_and_si512(mAccF0, m1A);
mAccF0 = _mm512_and_si512(mAccF0, m1B);

src1 += 4; src2 += 4; dst += 4;
} while (src1 < src_end1);

mAccF0 = _mm512_and_si512(mAccF0, mAccF1);

// bool avx512_or_block_3way(__m512i* dst, const __m512i* src1, const __m512i* src2)
// OR the destination with two source blocks: dst |= *src1 | *src2
__m512i m1A, m1B, m1C, m1D;
__m512i mAccF0, mAccF1;
mAccF0 = mAccF1 = _mm512_set1_epi32(~0u);

m1A = _mm512_or_si512(_mm512_load_si512(src1+0), _mm512_load_si512(dst+0));
m1B = _mm512_or_si512(_mm512_load_si512(src1+1), _mm512_load_si512(dst+1));
m1C = _mm512_or_si512(_mm512_load_si512(src1+2), _mm512_load_si512(dst+2));
m1D = _mm512_or_si512(_mm512_load_si512(src1+3), _mm512_load_si512(dst+3));

m1A = _mm512_or_si512(m1A, _mm512_load_si512(src2+0));
m1B = _mm512_or_si512(m1B, _mm512_load_si512(src2+1));
m1C = _mm512_or_si512(m1C, _mm512_load_si512(src2+2));
m1D = _mm512_or_si512(m1D, _mm512_load_si512(src2+3));

_mm512_store_si512(dst+0, m1A);
_mm512_store_si512(dst+1, m1B);
_mm512_store_si512(dst+2, m1C);
_mm512_store_si512(dst+3, m1D);

mAccF1 = _mm512_and_si512(mAccF1, m1C);
mAccF1 = _mm512_and_si512(mAccF1, m1D);
mAccF0 = _mm512_and_si512(mAccF0, m1A);
mAccF0 = _mm512_and_si512(mAccF0, m1B);

src1 += 4; src2 += 4; dst += 4;
} while (src1 < src_end1);

mAccF0 = _mm512_and_si512(mAccF0, mAccF1);

// bool avx512_or_block_5way(__m512i* dst, const __m512i* src1, const __m512i* src2, const __m512i* src3, const __m512i* src4)
// OR the destination with four source blocks: dst |= *src1 | *src2 | *src3 | *src4
__m512i m1A, m1B, m1C, m1D;
__m512i mAccF0, mAccF1;
mAccF0 = mAccF1 = _mm512_set1_epi32(~0u);

m1A = _mm512_or_si512(_mm512_load_si512(src1+0), _mm512_load_si512(dst+0));
m1B = _mm512_or_si512(_mm512_load_si512(src1+1), _mm512_load_si512(dst+1));
m1C = _mm512_or_si512(_mm512_load_si512(src1+2), _mm512_load_si512(dst+2));
m1D = _mm512_or_si512(_mm512_load_si512(src1+3), _mm512_load_si512(dst+3));

m1A = _mm512_or_si512(m1A, _mm512_load_si512(src2+0));
m1B = _mm512_or_si512(m1B, _mm512_load_si512(src2+1));
m1C = _mm512_or_si512(m1C, _mm512_load_si512(src2+2));
m1D = _mm512_or_si512(m1D, _mm512_load_si512(src2+3));

m1A = _mm512_or_si512(m1A, _mm512_load_si512(src3+0));
m1B = _mm512_or_si512(m1B, _mm512_load_si512(src3+1));
m1C = _mm512_or_si512(m1C, _mm512_load_si512(src3+2));
m1D = _mm512_or_si512(m1D, _mm512_load_si512(src3+3));

m1A = _mm512_or_si512(m1A, _mm512_load_si512(src4+0));
m1B = _mm512_or_si512(m1B, _mm512_load_si512(src4+1));
m1C = _mm512_or_si512(m1C, _mm512_load_si512(src4+2));
m1D = _mm512_or_si512(m1D, _mm512_load_si512(src4+3));

_mm512_store_si512(dst+0, m1A);
_mm512_store_si512(dst+1, m1B);
_mm512_store_si512(dst+2, m1C);
_mm512_store_si512(dst+3, m1D);

mAccF1 = _mm512_and_si512(mAccF1, m1C);
mAccF1 = _mm512_and_si512(mAccF1, m1D);
mAccF0 = _mm512_and_si512(mAccF0, m1A);
mAccF0 = _mm512_and_si512(mAccF0, m1B);

src1 += 4; src2 += 4;
src3 += 4; src4 += 4;
} while (src1 < src_end1);

mAccF0 = _mm512_and_si512(mAccF0, mAccF1);

// unsigned avx512_xor_block(__m512i* dst, const __m512i* src)
// XOR block against another: dst ^= *src
__m512i m1A, m1B, m1C, m1D;
__m512i accA, accB, accC, accD;
accA = accB = accC = accD = _mm512_setzero_si512();

m1A = _mm512_xor_si512(_mm512_load_si512(src+0), _mm512_load_si512(dst+0));
m1B = _mm512_xor_si512(_mm512_load_si512(src+1), _mm512_load_si512(dst+1));
m1C = _mm512_xor_si512(_mm512_load_si512(src+2), _mm512_load_si512(dst+2));
m1D = _mm512_xor_si512(_mm512_load_si512(src+3), _mm512_load_si512(dst+3));

_mm512_store_si512(dst+0, m1A);
_mm512_store_si512(dst+1, m1B);
_mm512_store_si512(dst+2, m1C);
_mm512_store_si512(dst+3, m1D);

accA = _mm512_or_si512(accA, m1A);
accB = _mm512_or_si512(accB, m1B);
accC = _mm512_or_si512(accC, m1C);
accD = _mm512_or_si512(accD, m1D);
} while (src < src_end);

accA = _mm512_or_si512(accA, accB);
accC = _mm512_or_si512(accC, accD);
accA = _mm512_or_si512(accA, accC);

// unsigned avx512_xor_block_2way(__m512i* dst, const __m512i* src1, const __m512i* src2)
// 3-operand XOR: dst = *src1 ^ *src2
__m512i m1A, m1B, m1C, m1D;
__m512i accA, accB, accC, accD;
accA = accB = accC = accD = _mm512_setzero_si512();

m1A = _mm512_xor_si512(_mm512_load_si512(src1 + 0), _mm512_load_si512(src2 + 0));
m1B = _mm512_xor_si512(_mm512_load_si512(src1 + 1), _mm512_load_si512(src2 + 1));
m1C = _mm512_xor_si512(_mm512_load_si512(src1 + 2), _mm512_load_si512(src2 + 2));
m1D = _mm512_xor_si512(_mm512_load_si512(src1 + 3), _mm512_load_si512(src2 + 3));

_mm512_store_si512(dst + 0, m1A);
_mm512_store_si512(dst + 1, m1B);
_mm512_store_si512(dst + 2, m1C);
_mm512_store_si512(dst + 3, m1D);

accA = _mm512_or_si512(accA, m1A);
accB = _mm512_or_si512(accB, m1B);
accC = _mm512_or_si512(accC, m1C);
accD = _mm512_or_si512(accD, m1D);

src1 += 4; src2 += 4; dst += 4;
} while (src1 < src1_end);

accA = _mm512_or_si512(accA, accB);
accC = _mm512_or_si512(accC, accD);
accA = _mm512_or_si512(accA, accC);

// unsigned avx512_sub_block(__m512i* dst, const __m512i* src)
// AND NOT (SUB) array elements against another array: dst &= ~*src
__m512i m1A, m1B, m1C, m1D;
__m512i accA, accB, accC, accD;
accA = accB = accC = accD = _mm512_setzero_si512();

m1A = _mm512_andnot_si512(_mm512_load_si512(src), _mm512_load_si512(dst));
m1B = _mm512_andnot_si512(_mm512_load_si512(src+1), _mm512_load_si512(dst+1));
m1C = _mm512_andnot_si512(_mm512_load_si512(src+2), _mm512_load_si512(dst+2));
m1D = _mm512_andnot_si512(_mm512_load_si512(src+3), _mm512_load_si512(dst+3));

_mm512_store_si512(dst+0, m1A);
_mm512_store_si512(dst+1, m1B);
_mm512_store_si512(dst+2, m1C);
_mm512_store_si512(dst+3, m1D);

accA = _mm512_or_si512(accA, m1A);
accB = _mm512_or_si512(accB, m1B);
accC = _mm512_or_si512(accC, m1C);
accD = _mm512_or_si512(accD, m1D);
} while (src < src_end);

accA = _mm512_or_si512(accA, accB);
accC = _mm512_or_si512(accC, accD);
accA = _mm512_or_si512(accA, accC);

// bool avx512_sub_digest(__m512i* dst, const __m512i* src)
// SUB (AND NOT) one digest stride: dst &= ~*src
m1A = _mm512_andnot_si512(_mm512_load_si512(src+0), _mm512_load_si512(dst+0));
m1B = _mm512_andnot_si512(_mm512_load_si512(src+1), _mm512_load_si512(dst+1));

_mm512_store_si512(dst+0, m1A);
_mm512_store_si512(dst+1, m1B);

m1A = _mm512_or_si512(m1A, m1B);

// void avx512_set_block(__m512i* dst, bm::word_t value)
// block memset: fill the block with 'value'
__m512i zmm0 = _mm512_set1_epi32(int(value));

_mm512_store_si512(dst, zmm0);
_mm512_store_si512(dst+1, zmm0);
_mm512_store_si512(dst+2, zmm0);
_mm512_store_si512(dst+3, zmm0);
} while (dst < dst_end);

// void avx512_copy_block(__m512i* dst, const __m512i* src)
// block copy: dst = *src
__m512i ymm0, ymm1, ymm2, ymm3;

ymm0 = _mm512_load_si512(src+0);
ymm1 = _mm512_load_si512(src+1);
ymm2 = _mm512_load_si512(src+2);
ymm3 = _mm512_load_si512(src+3);

_mm512_store_si512(dst+0, ymm0);
_mm512_store_si512(dst+1, ymm1);
_mm512_store_si512(dst+2, ymm2);
_mm512_store_si512(dst+3, ymm3);
} while (src < src_end);

// void avx512_invert_block(__m512i* dst)
// invert the bit-block: dst = ~*dst
__m512i maskFF = _mm512_set1_epi64(-1);

ymm0 = _mm512_xor_si512(_mm512_load_si512(dst+0), maskFF);
ymm1 = _mm512_xor_si512(_mm512_load_si512(dst+1), maskFF);

_mm512_store_si512(dst+0, ymm0);
_mm512_store_si512(dst+1, ymm1);

ymm0 = _mm512_xor_si512(_mm512_load_si512(dst+2), maskFF);
ymm1 = _mm512_xor_si512(_mm512_load_si512(dst+3), maskFF);

_mm512_store_si512(dst+2, ymm0);
_mm512_store_si512(dst+3, ymm1);
} while (dst < dst_end);

// bool avx512_is_all_zero(const __m512i* block)
// check if the block is all zero bits
__m512i w0 = _mm512_load_si512(block+0);
__m512i w1 = _mm512_load_si512(block+1);

__m512i wA = _mm512_or_si512(w0, w1);

__m512i w2 = _mm512_load_si512(block+2);
__m512i w3 = _mm512_load_si512(block+3);

__m512i wB = _mm512_or_si512(w2, w3);
wA = _mm512_or_si512(wA, wB);
} while (block < block_end);

// bool avx512_is_digest_zero(const __m512i* block)
// check if one digest stride is all zero bits
_mm512_or_si512(_mm512_load_si512(block+0),
                _mm512_load_si512(block+1));

// bool avx512_is_all_one(const __m512i* block)
// check if the block is all one bits
const __mmask16 m16F = __mmask16(~0u);
__m512i maskF = _mm512_set1_epi64(-1);

__mmask16 eq_m = _mm512_cmpeq_epi32_mask(_mm512_load_si512(block), maskF);
eq_m = _mm512_cmpeq_epi32_mask(_mm512_load_si512(block+1), maskF);
} while (block < block_end);

// bool avx2_test_all_zero_wave(const void* ptr)
// check if a wave of pointers is all NULL
__m256i w0 = _mm256_loadu_si256((__m256i*)ptr);
return _mm256_testz_si256(w0, w0);

// bool avx2_test_all_zero_wave2(const void* ptr0, const void* ptr1)
// check if two waves of pointers are all NULL
__m256i w0 = _mm256_loadu_si256((__m256i*)ptr0);
__m256i w1 = _mm256_loadu_si256((__m256i*)ptr1);
w0 = _mm256_or_si256(w0, w1);
return _mm256_testz_si256(w0, w0);

// bool avx2_test_all_eq_wave2(const void* ptr0, const void* ptr1)
// check if two waves of pointers are identical (all NULL or all FULL)
__m256i w0 = _mm256_loadu_si256((__m256i*)ptr0);
__m256i w1 = _mm256_loadu_si256((__m256i*)ptr1);
w0 = _mm256_xor_si256(w0, w1);
return _mm256_testz_si256(w0, w0);
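A "wave" here is simply a cache-line sized group of sub-block pointers; loading the pointer array as raw 256-bit data lets one vector test decide whether the whole group is NULL. A hedged usage sketch follows; the wrapper name is illustrative and 64-bit pointers are assumed:

#include <immintrin.h>

// Four 64-bit pointers occupy exactly 256 bits, so one VPTEST answers
// "are all four NULL?" without a per-pointer branch.
inline bool wave_of_4_ptrs_all_null(void* const* ptrs)
{
    __m256i w0 = _mm256_loadu_si256((const __m256i*)ptrs);
    return _mm256_testz_si256(w0, w0) != 0;
}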

// bit-block count-change scan (counts transitions between 0 and 1 runs;
// cf. sse42_bit_block_calc_count_change declared below)
unsigned count = (unsigned)(block_end - block)*4;

const int w_shift = sizeof(w0) * 8 - 1;
bool first_word = true;

count -= (w_prev = (w0 >> w_shift));

first_word = false;

count -= !(w_prev ^ (w0 & 1));
count -= !w_prev; w_prev ^= w_prev;

count -= !(w_prev ^ (w0 & 1));
count -= !w_prev; w_prev ^= w_prev;

count -= !(w_prev ^ (w0 & 1));
count -= !w_prev; w_prev ^= w_prev;

count -= !(w_prev ^ (w0 & 1));
count -= !w_prev; w_prev ^= w_prev;
} while (++block < block_end);

// const bm::gap_word_t* avx2_gap_sum_arr(const bm::gap_word_t* pbuf, unsigned avx_vect_waves, unsigned* sum)
// sum GAP (RLE) run lengths: accumulate 16-bit coordinates, then reduce horizontally
__m256i xcnt = _mm256_setzero_si256();

for (unsigned i = 0; i < avx_vect_waves; ++i)
{
    __m256i ymm0 = _mm256_loadu_si256((__m256i*)(pbuf - 1));
    __m256i ymm1 = _mm256_loadu_si256((__m256i*)(pbuf + 16 - 1));
    __m256i ymm_s2 = _mm256_add_epi16(ymm1, ymm0);
    xcnt = _mm256_add_epi16(xcnt, ymm_s2);
}

xcnt = _mm256_sub_epi16(_mm256_bsrli_epi128(xcnt, 2), xcnt);

xcnt = _mm256_add_epi16(_mm256_bsrli_epi128(xcnt, 4), xcnt);
xcnt = _mm256_add_epi16(_mm256_bsrli_epi128(xcnt, 8), xcnt);
__m128i xcnt2 = _mm_add_epi16(_mm256_extracti128_si256(xcnt, 1), _mm256_extracti128_si256(xcnt, 0));

// unsigned avx2_idx_arr_block_lookup(const unsigned* idx, unsigned size, unsigned nb, unsigned start)
// scan a sorted index array for how far its entries still fall into block 'nb'
const unsigned unroll_factor = 16;
const unsigned len = (size - start);
const unsigned len_unr = len - (len % unroll_factor);

__m256i nbM = _mm256_set1_epi32(int(nb));

for (k = 0; k < len_unr; k+=unroll_factor)

__m256i idxA = _mm256_loadu_si256((__m256i*)(idx+k));

__m256i wcmpA= _mm256_cmpeq_epi8(nbM, nbA);
if (~0u != unsigned(_mm256_movemask_epi8(wcmpA)))

__m256i idxB = _mm256_loadu_si256((__m256i*)(idx+k+8));

__m256i wcmpB = _mm256_cmpeq_epi8(nbM, nbB);
if (~0u != unsigned(_mm256_movemask_epi8(wcmpB)))

for (; k < len; ++k)

// void avx2_bit_block_gather_scatter(unsigned* arr, const unsigned* blk, const unsigned* idx, unsigned size, unsigned start, unsigned bit_idx)
// gather bits of block 'blk' at positions idx[...] and scatter them into arr[...] at bit_idx
const unsigned unroll_factor = 8;
const unsigned len = (size - start);
const unsigned len_unr = len - (len % unroll_factor);

__m256i maskFF = _mm256_set1_epi32(~0u);

__m256i mask_tmp, mask_0;

unsigned k = 0, mask, w_idx;
for (; k < len_unr; k+=unroll_factor)

__m256i nbitA, nwordA;
const unsigned base = start + k;
__m256i* idx_ptr = (__m256i*)(idx+base);

nbitA = _mm256_and_si256 (_mm256_loadu_si256(idx_ptr), sb_mask);

mask_tmp = _mm256_shuffle_epi32 (nwordA, _MM_SHUFFLE(1,1,1,1));
mask_tmp = _mm256_permute2x128_si256 (mask_tmp, mask_tmp, 0);
mask = _mm256_movemask_epi8(_mm256_cmpeq_epi32(mask_tmp, nwordA));
_mm256_store_si256((__m256i*)mword_v, nwordA);

mask_tmp = _mm256_set1_epi32(blk[w_idx]);               // fast path: all eight lanes hit the same word

mask_tmp = _mm256_set_epi32(blk[mword_v[7]], blk[mword_v[6]],
                            blk[mword_v[5]], blk[mword_v[4]],
                            blk[mword_v[3]], blk[mword_v[2]],
                            blk[mword_v[1]], blk[mword_v[0]]);   // general path: per-lane gather

__m256i shiftA = _mm256_and_si256 (nbitA, sw_mask);
__m256i mask1 = _mm256_srli_epi32 (maskFF, 31);
mask_0 = _mm256_sllv_epi32(mask1, shiftA);

mask_tmp = _mm256_and_si256(mask_tmp, mask_0);
if (!_mm256_testz_si256(mask_tmp, mask_tmp))
{
    __m256i* target_ptr = (__m256i*)(arr+base);

    __m256i maskZ = _mm256_xor_si256(maskFF, maskFF);
    mask1 = _mm256_slli_epi32(mask1, bit_idx);
    mask_tmp = _mm256_cmpeq_epi32 (mask_tmp, maskZ);
    mask_tmp = _mm256_xor_si256 (mask_tmp, maskFF);
    mask_tmp = _mm256_and_si256 (mask_tmp, mask1);
    _mm256_storeu_si256 (target_ptr,
                         _mm256_or_si256 (mask_tmp,
                                          _mm256_loadu_si256(target_ptr)));
}

for (; k < len; ++k)

const unsigned base = start + k;

#pragma GCC diagnostic pop

#define VECT_XOR_ARR_2_MASK(dst, src, src_end, mask)\
    avx512_xor_arr_2_mask((__m512i*)(dst), (__m512i*)(src), (__m512i*)(src_end), (bm::word_t)mask)

#define VECT_ANDNOT_ARR_2_MASK(dst, src, src_end, mask)\
    avx512_andnot_arr_2_mask((__m512i*)(dst), (__m512i*)(src), (__m512i*)(src_end), (bm::word_t)mask)

#define VECT_BITCOUNT(first, last) \
    avx2_bit_count((__m256i*) (first), (__m256i*) (last))

#define VECT_BITCOUNT_AND(first, last, mask) \
    avx2_bit_count_and((__m256i*) (first), (__m256i*) (last), (__m256i*) (mask))

#define VECT_BITCOUNT_OR(first, last, mask) \
    avx2_bit_count_or((__m256i*) (first), (__m256i*) (last), (__m256i*) (mask))

#define VECT_BITCOUNT_XOR(first, last, mask) \
    avx2_bit_count_xor((__m256i*) (first), (__m256i*) (last), (__m256i*) (mask))

#define VECT_BITCOUNT_SUB(first, last, mask) \
    avx2_bit_count_sub((__m256i*) (first), (__m256i*) (last), (__m256i*) (mask))

#define VECT_INVERT_BLOCK(first) \
    avx512_invert_block((__m512i*)first);

#define VECT_AND_BLOCK(dst, src) \
    avx512_and_block((__m512i*) dst, (const __m512i*) (src))

#define VECT_AND_DIGEST(dst, src) \
    avx512_and_digest((__m512i*) dst, (const __m512i*) (src))

#define VECT_AND_DIGEST_2WAY(dst, src1, src2) \
    avx512_and_digest_2way((__m512i*) dst, (const __m512i*) (src1), (const __m512i*) (src2))

#define VECT_OR_BLOCK(dst, src) \
    avx512_or_block((__m512i*) dst, (__m512i*) (src))

#define VECT_OR_BLOCK_2WAY(dst, src1, src2) \
    avx512_or_block_2way((__m512i*) dst, (__m512i*) (src1), (__m512i*) (src2))

#define VECT_OR_BLOCK_3WAY(dst, src1, src2) \
    avx512_or_block_3way((__m512i*) dst, (__m512i*) (src1), (__m512i*) (src2))

#define VECT_OR_BLOCK_5WAY(dst, src1, src2, src3, src4) \
    avx512_or_block_5way((__m512i*) dst, (__m512i*) (src1), (__m512i*) (src2), (__m512i*) (src3), (__m512i*) (src4))

#define VECT_SUB_BLOCK(dst, src) \
    avx512_sub_block((__m512i*) dst, (__m512i*) (src))

#define VECT_SUB_DIGEST(dst, src) \
    avx512_sub_digest((__m512i*) dst, (const __m512i*) (src))

#define VECT_XOR_BLOCK(dst, src) \
    avx512_xor_block((__m512i*) dst, (__m512i*) (src))

#define VECT_XOR_BLOCK_2WAY(dst, src1, src2) \
    avx512_xor_block_2way((__m512i*) dst, (__m512i*) (src1), (__m512i*) (src2))

#define VECT_COPY_BLOCK(dst, src) \
    avx512_copy_block((__m512i*) dst, (__m512i*) (src))

#define VECT_SET_BLOCK(dst, value) \
    avx512_set_block((__m512i*) dst, (value))

#define VECT_IS_ZERO_BLOCK(dst) \
    avx512_is_all_zero((__m512i*) dst)

#define VECT_IS_ONE_BLOCK(dst) \
    avx512_is_all_one((__m512i*) dst)

#define VECT_IS_DIGEST_ZERO(start) \
    avx512_is_digest_zero((__m512i*)start)

#define VECT_ARR_BLOCK_LOOKUP(idx, size, nb, start) \
    avx2_idx_arr_block_lookup(idx, size, nb, start)
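These VECT_* macros are the dispatch points that the library's portable bit-block algorithms compile against; with this header active they expand to the AVX-512/AVX2 kernels above. A hedged caller-side sketch follows; the wrapper functions and the direct #include path are illustrative assumptions, not part of the library:

#include "util/bitset/bmavx512.h"   // assumed include path, per the page title

// VECT_AND_BLOCK expands to avx512_and_block(...) here; other SIMD targets
// map the same macro to their own kernel, so calling code stays generic.
inline unsigned block_and_inplace(bm::word_t* dst, const bm::word_t* src)
{
    return VECT_AND_BLOCK(dst, src);
}

inline void block_copy(bm::word_t* dst, const bm::word_t* src)
{
    VECT_COPY_BLOCK(dst, src);
}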

#define BM_AVX2_POPCNT_PROLOG

#define BM_CSA256(h, l, a, b, c)

#define BM_AVX2_BIT_COUNT(ret, v)

bool avx2_test_all_zero_wave2(const void *ptr0, const void *ptr1)

check if 2 waves of pointers are all NULL

bool avx2_or_arr_unal(__m256i *dst, const __m256i *src, const __m256i *src_end)

OR array elements against another unaligned array dst |= *src.

bool avx2_test_all_eq_wave2(const void *ptr0, const void *ptr1)

check if 2 waves of pointers are all the same (NULL or FULL)

unsigned avx2_and_arr_unal(__m256i *dst, const __m256i *src, const __m256i *src_end)

AND array elements against another array (unaligned) dst &= *src.

bm::id_t avx2_bit_count_sub(const __m256i *block, const __m256i *block_end, const __m256i *mask_block)

AND NOT bit count for two aligned bit-blocks.

bool avx2_test_all_zero_wave(const void *ptr)

check if wave of pointers is all NULL

bm::id_t avx2_bit_count_xor(const __m256i *block, const __m256i *block_end, const __m256i *mask_block)

XOR bit count for two aligned bit-blocks.

bm::id_t avx2_bit_count(const __m256i *block, const __m256i *block_end)

AVX2 Harley-Seal popcount. The algorithm is based on the paper "Faster Population Counts using AVX2 Instructions".

bm::id_t avx2_bit_count_and(const __m256i *block, const __m256i *block_end, const __m256i *mask_block)

AND bit count for two aligned bit-blocks.

void avx512_copy_block(__m512i *dst, const __m512i *src)

block copy dst = *src

void avx512_xor_arr_2_mask(__m512i *dst, const __m512i *src, const __m512i *src_end, bm::word_t mask)

XOR array elements to specified mask dst = *src ^ mask.

bool avx512_or_block_5way(__m512i *dst, const __m512i *src1, const __m512i *src2, const __m512i *src3, const __m512i *src4)

OR array elements against another 4 arrays: dst |= *src1 | *src2 | *src3 | *src4.

bool avx512_test_zero(__m512i m)

bool avx512_and_digest_2way(__m512i *dst, const __m512i *src1, const __m512i *src2)

AND block digest stride 2 way dst = *src1 & *src2.

bool avx512_or_block(__m512i *dst, const __m512i *src)

OR array elements against another array dst |= *src.

bool avx512_or_block_2way(__m512i *dst, const __m512i *src1, const __m512i *src2)

OR 2 blocks, copy to destination dst = *src1 | src2.

void avx512_invert_block(__m512i *dst)

Invert bit-block dst = ~*dst or dst ^= *dst.

void avx512_andnot_arr_2_mask(__m512i *dst, const __m512i *src, const __m512i *src_end, bm::word_t mask)

Inverts array elements and NOT them to specified mask dst = ~*src & mask.

unsigned avx512_and_block(__m512i *dst, const __m512i *src)

AND array elements against another array dst &= *src.

bool avx512_test_one(__m512i m)

unsigned avx512_sub_block(__m512i *dst, const __m512i *src)

AND-NOT (SUB) array elements against another array dst &= ~*src.

bool avx512_is_digest_zero(const __m512i *block)

check if digest stride is all zero bits

unsigned avx512_xor_block(__m512i *dst, const __m512i *src)

XOR block against another dst ^= *src.

bool avx512_is_all_one(const __m512i *block)

check if block is all one bits

bool avx512_is_all_zero(const __m512i *block)

check if block is all zero bits

bool avx512_sub_digest(__m512i *dst, const __m512i *src)

SUB (AND NOT) block digest stride dst &= *src.

unsigned avx512_xor_block_2way(__m512i *dst, const __m512i *src1, const __m512i *src2)

3-operand XOR dst = *src1 ^ *src2

void avx512_set_block(__m512i *dst, bm::word_t value)

AVX512 block memset dst = value.

bool avx512_or_block_3way(__m512i *dst, const __m512i *src1, const __m512i *src2)

OR array elements against another 2 arrays dst |= *src1 | src2.

bool avx512_and_digest(__m512i *dst, const __m512i *src)

AND block digest stride dst &= *src.

bm::id_t sse42_bit_block_calc_count_change(const __m128i *block, const __m128i *block_end, unsigned *bit_count)

unsigned avx2_idx_arr_block_lookup(const unsigned *idx, unsigned size, unsigned nb, unsigned start)

const unsigned set_block_mask

const bm::gap_word_t * avx2_gap_sum_arr(const bm::gap_word_t *pbuf, unsigned avx_vect_waves, unsigned *sum)

bm::id_t avx2_bit_count_or(const __m256i *block, const __m256i *block_end, const __m256i *mask_block)

const unsigned set_word_shift

const unsigned set_block_size

unsigned long long int id64_t

unsigned short gap_word_t

void avx2_bit_block_gather_scatter(unsigned *arr, const unsigned *blk, const unsigned *idx, unsigned size, unsigned start, unsigned bit_idx)

const unsigned set_block_shift

const unsigned set_word_mask

static __m128i _mm_xor_si128(__m128i a, __m128i b)

static int _mm_cvtsi128_si32(__m128i a)

static int _mm_popcnt_u32(unsigned int a)

static __m128i _mm_add_epi16(__m128i a, __m128i b)

#define _mm_srli_epi32(a, imm)

static void _mm_prefetch(const void *p, int i)

static __m128i _mm_load_si128(const __m128i *p)

#define _MM_SHUFFLE(fp3, fp2, fp1, fp0)

MACRO for shuffle parameter for _mm_shuffle_ps().

#define _mm_extract_epi32(a, imm)

