A RetroSearch Logo

Home - News ( United States | United Kingdom | Italy | Germany ) - Football scores

Search Query:

Showing content from https://github.com/marbl/canu/commit/cb94432adf36cbb77c7ec22517a50e3051abd066 below:

overlap rescoring works; improved microsatellite check; ignore d… · marbl/canu@cb94432 · GitHub

@@ -318,24 +318,29 @@ CollectKmerStat(const char* seq, int32 reg_len, int32 kmer_len, int32 *stats) {

318 318 319 319

static

320 320

bool

321 -

CheckTrivialDNA(const char* seq, int32 remaining, int32 offset) {

321 +

CheckTrivialDNA(const char* seq, int32 remaining, int32 offset, int32 size_factor, int32 repeat_cnt) {

322 322

//TODO configure trivial DNA analysis

323 -

static const int32 SIZE_FACTOR = 6;

324 -

static const int32 REPEAT_NUM = 5;

323 +

//static const int32 SIZE_FACTOR = 6;

324 +

//static const int32 REPEAT_NUM = 5;

325 +

//static const int32 SIZE_FACTOR = 4;

326 +

//static const int32 REPEAT_NUM = 3;

325 327

static const int32 MIN_K = 2;

326 328

static const int32 MAX_K = 5;

327 329 328 330

int32 stats_buff[1 << (2 * MAX_K)];

329 331

for (int32 k = MIN_K; k <= MAX_K; ++k) {

330 332

const int32 possible_kmer_cnt = 1 << (2 * k);

331 -

int32 reg_len = k * SIZE_FACTOR;

333 +

int32 reg_len = k * size_factor;

332 334 333 335

//exploring sequence to the right

334 -

for (int32 shift = 0; shift < k; ++shift) {

336 +

//fprintf(stderr, "checking upstream k=%d, init_shift=%d\n", k, std::max(-k, -offset));

337 +

for (int32 shift = std::max(-k, -offset); shift < k; ++shift) {

338 + 335 339

if (reg_len + shift > remaining)

336 340

break;

337 341

CollectKmerStat(seq + shift, reg_len, k, stats_buff);

338 -

if (*std::max_element(stats_buff, stats_buff + possible_kmer_cnt) >= REPEAT_NUM) {

342 +

if (*std::max_element(stats_buff, stats_buff + possible_kmer_cnt) >= repeat_cnt) {

343 +

//comment out!

339 344

//char subbuff[reg_len + 1];

340 345

//memcpy(subbuff, seq + shift, reg_len);

341 346

//subbuff[reg_len] = '\0';

@@ -345,12 +350,14 @@ CheckTrivialDNA(const char* seq, int32 remaining, int32 offset) {

345 350

}

346 351

}

347 352 353 +

//fprintf(stderr, "checking downstream k=%d, init_shift=%d\n", k, std::max(-k, -remaining));

348 354

//exploring sequence to the left

349 -

for (int32 shift = 0; shift < k; ++shift) {

355 +

for (int32 shift = std::max(-k, -remaining); shift < k; ++shift) {

350 356

if (reg_len + shift > offset)

351 357

break;

352 358

CollectKmerStat(seq - shift - 1, -reg_len, k, stats_buff);

353 -

if (*std::max_element(stats_buff, stats_buff + possible_kmer_cnt) >= REPEAT_NUM) {

359 +

if (*std::max_element(stats_buff, stats_buff + possible_kmer_cnt) >= repeat_cnt) {

360 +

//comment out!

354 361

//char subbuff[reg_len + 1];

355 362

//memcpy(subbuff, seq - shift - reg_len, reg_len);

356 363

//subbuff[reg_len] = '\0';

@@ -365,24 +372,40 @@ CheckTrivialDNA(const char* seq, int32 remaining, int32 offset) {

365 372 366 373

static

367 374

bool

368 -

CheckNonTrivialDNA(const char* a_part, const char* b_part,

375 +

CheckTrivialDNA(const char* a_part, const char* b_part,

369 376

int32 a_len, int32 b_len,

370 -

int32 a_pos, int32 b_pos) {

371 -

//fprintf(stderr, "Checking for trivial DNA in A around position %d\n", i);

372 -

if (CheckTrivialDNA(a_part + a_pos, /*remaining*/a_len - a_pos, /*offset*/a_pos))

373 -

return false;

374 -

//fprintf(stderr, "Checking for trivial DNA in B around position %d\n", j);

375 -

if (CheckTrivialDNA(b_part + b_pos, /*remaining*/b_len - b_pos, /*offset*/b_pos))

376 -

return false;

377 -

return true;

377 +

int32 a_pos, int32 b_pos,

378 +

int32 size_factor, int32 repeat_cnt) {

379 +

//fprintf(stderr, "Checking for trivial DNA in around positions %d, %d\n", a_pos, b_pos);

380 + 381 +

if (CheckTrivialDNA(a_part + a_pos, /*remaining*/a_len - a_pos, /*offset*/a_pos, size_factor, repeat_cnt)) {

382 +

//fprintf(stderr, "Trivial DNA detected in A around position %d\n", a_pos);

383 +

return true;

384 +

}

385 +

if (CheckTrivialDNA(b_part + b_pos, /*remaining*/b_len - b_pos, /*offset*/b_pos, size_factor, repeat_cnt)) {

386 +

//fprintf(stderr, "Trivial DNA detected in B around position %d\n", b_pos);

387 +

return true;

388 +

}

389 +

//fprintf(stderr, "NON-Trivial DNA!!!\n", a_pos);

390 +

return false;

378 391

}

379 392 380 393

static

381 394

std::pair<size_t, size_t>

382 395

ComputeErrors(const char* a_part, const char* b_part,

383 396

int32 delta_len, int32 *deltas,

384 397

int32 a_len, int32 b_len,

385 -

bool check_trivial_dna) {

398 +

bool check_trivial_dna,

399 +

uint32 ignore_flank) {

400 + 401 +

static const int32 MM_SIZE_FACTOR = 6;

402 +

static const int32 MM_REPEAT_NUM = 5;

403 + 404 +

//static const int32 IND_SIZE_FACTOR = MM_SIZE_FACTOR;

405 +

//static const int32 IND_REPEAT_NUM = MM_REPEAT_NUM;

406 +

static const int32 IND_SIZE_FACTOR = 4;

407 +

static const int32 IND_REPEAT_NUM = 3;

408 + 386 409

// Event counter. Each individual (1bp) mismatch/insertion/deletion is an event

387 410

int32 all_ct = 0;

388 411

// Processed event counter

@@ -394,6 +417,21 @@ ComputeErrors(const char* a_part, const char* b_part,

394 417

//position in "alignment" of a_part and b_part

395 418

int32 p = 0;

396 419 420 +

auto cnt_event_f = [&](int32 size_factor, int32 repeat_cnt) {

421 +

if (i < ignore_flank ||

422 +

j < ignore_flank ||

423 +

i + ignore_flank >= a_len ||

424 +

j + ignore_flank >= b_len) {

425 +

return false;

426 +

}

427 +

if (check_trivial_dna &&

428 +

CheckTrivialDNA(a_part, b_part, a_len, b_len, i, j,

429 +

size_factor, repeat_cnt)) {

430 +

return false;

431 +

}

432 +

return true;

433 +

};

434 + 397 435

for (int32 k=0; k < delta_len; k++) {

398 436

//fprintf(stderr, "k=%d deltalen=%d i=%d our of %d j=%d out of %d\n", k, wa->ped.deltaLen, i, a_len, j, b_len);

399 437

@@ -404,8 +442,7 @@ ComputeErrors(const char* a_part, const char* b_part,

404 442

//fprintf(stderr, "SUBST %c -> %c at %d #%d\n", a_part[i], b_part[j], i, p);

405 443 406 444

all_ct++;

407 -

if (!check_trivial_dna || CheckNonTrivialDNA(a_part, b_part, a_len, b_len, i, j))

408 -

ct++;

445 +

ct += (int32) cnt_event_f(MM_SIZE_FACTOR, MM_REPEAT_NUM);

409 446

}

410 447 411 448

i++; //assert(i <= a_len);

@@ -419,8 +456,7 @@ ComputeErrors(const char* a_part, const char* b_part,

419 456

//Insertion at i - 1 in a_part (p in "alignment")

420 457

//fprintf(stderr, "INSERT %c at %d #%d\n", b_part[j], i-1, p);

421 458

all_ct++;

422 -

if (!check_trivial_dna || CheckNonTrivialDNA(a_part, b_part, a_len, b_len, i, j))

423 -

ct++;

459 +

ct += (int32) cnt_event_f(IND_SIZE_FACTOR, IND_REPEAT_NUM);

424 460 425 461

j++; //assert(j <= b_len);

426 462

p++;

@@ -432,8 +468,7 @@ ComputeErrors(const char* a_part, const char* b_part,

432 468

//Deletion at i in a_part (p in "alignment")

433 469

//fprintf(stderr, "DELETE %c at %d #%d\n", a_part[i], i, p);

434 470

all_ct++;

435 -

if (!check_trivial_dna || CheckNonTrivialDNA(a_part, b_part, a_len, b_len, i, j))

436 -

ct++;

471 +

ct += (int32) cnt_event_f(IND_SIZE_FACTOR, IND_REPEAT_NUM);

437 472 438 473

i++; //assert(i <= a_len);

439 474

p++;

@@ -446,8 +481,7 @@ ComputeErrors(const char* a_part, const char* b_part,

446 481

//fprintf(stderr, "SUBST %c -> %c at %d #%d\n", a_part[i], b_part[j], i, p);

447 482

//Substitution at i in a_part (p in "alignment")

448 483

all_ct++;

449 -

if (!check_trivial_dna || CheckNonTrivialDNA(a_part, b_part, a_len, b_len, i, j))

450 -

ct++;

484 +

ct += (int32) cnt_event_f(MM_SIZE_FACTOR, MM_REPEAT_NUM);

451 485

}

452 486 453 487

i++; //assert(i <= a_len); // Guaranteed, we're looping on this

@@ -459,8 +493,8 @@ ComputeErrors(const char* a_part, const char* b_part,

459 493

// fprintf(stderr, "Reported %d out of %d\n", ct, all_ct);

460 494

//}

461 495 462 -

assert(i <= a_len);

463 -

assert(j <= b_len);

496 +

assert(i == a_len);

497 +

assert(j == b_len);

464 498 465 499

return std::make_pair(ct, p);

466 500

}

@@ -519,6 +553,7 @@ ProcessAlignment(int32 a_part_len, const char *a_part, int64 a_hang, int32 b_par

519 553

*match_to_end,

520 554

ped);

521 555 556 + 522 557

//Adjusting the extremities

523 558

//TODO discuss the logic!

524 559

//TODO refactor out the code duplication

@@ -560,6 +595,7 @@ ProcessAlignment(int32 a_part_len, const char *a_part, int64 a_hang, int32 b_par

560 595

all_errors -= i;

561 596

}

562 597 598 +

//fprintf(stderr, "Showing alignment\n");

563 599

//Display_Alignment(a_part, a_end, b_part, b_end, ped->delta, ped->deltaLen);

564 600 565 601

*invalid_olap = (std::min(a_end, b_end) <= 0);

@@ -571,13 +607,13 @@ ProcessAlignment(int32 a_part_len, const char *a_part, int64 a_hang, int32 b_par

571 607

int32 events;

572 608

int32 alignment_len;

573 609 610 +

//fprintf(stderr, "Computing errors\n");

611 +

static const uint32 FLANK_IGNORE = 5;

574 612

//fprintf(stderr, "Checking for trivial DNA regions: %d\n", check_trivial_dna);

575 613

std::tie(events, alignment_len) = ComputeErrors(a_part, b_part, ped->deltaLen, ped->delta,

576 -

a_end, b_end, check_trivial_dna);

614 +

a_end, b_end, check_trivial_dna, FLANK_IGNORE);

577 615 578 -

if (!check_trivial_dna && all_errors != events) {

579 -

fprintf(stderr, "Old errors %d new events %d\n", all_errors, events);

580 -

}

616 +

//fprintf(stderr, "Old errors %d new events %d\n", all_errors, events);

581 617

assert(check_trivial_dna || all_errors == events);

582 618 583 619

assert(events >= 0 && alignment_len > 0);

@@ -661,8 +697,8 @@ Redo_Olaps(coParameters *G, /*const*/ sqStore *seqStore) {

661 697

for (; thisOvl <= lastOvl && G->olaps[thisOvl].b_iid == curID; thisOvl++) {

662 698

const Olap_Info_t &olap = G->olaps[thisOvl];

663 699 664 -

//if (olap.b_iid != 39861)

665 -

// continue;

700 +

if (G->secondID != UINT32_MAX && olap.b_iid != G->secondID)

701 +

continue;

666 702 667 703

if (olap.normal) {

668 704

// fprintf(stderr, "b_part = fseq %40.40s\n", fseq);


RetroSearch is an open source project built by @garambo | Open a GitHub Issue

Search and Browse the WWW like it's 1997 | Search results from DuckDuckGo

HTML: 3.2 | Encoding: UTF-8 | Version: 0.7.4