@@ -318,24 +318,29 @@ CollectKmerStat(const char* seq, int32 reg_len, int32 kmer_len, int32 *stats) {
318
318
319
319
static
320
320
bool
321
-
CheckTrivialDNA(const char* seq, int32 remaining, int32 offset) {
321
+
CheckTrivialDNA(const char* seq, int32 remaining, int32 offset, int32 size_factor, int32 repeat_cnt) {
322
322
//TODO configure trivial DNA analysis
323
-
static const int32 SIZE_FACTOR = 6;
324
-
static const int32 REPEAT_NUM = 5;
323
+
//static const int32 SIZE_FACTOR = 6;
324
+
//static const int32 REPEAT_NUM = 5;
325
+
//static const int32 SIZE_FACTOR = 4;
326
+
//static const int32 REPEAT_NUM = 3;
325
327
static const int32 MIN_K = 2;
326
328
static const int32 MAX_K = 5;
327
329
328
330
int32 stats_buff[1 << (2 * MAX_K)];
329
331
for (int32 k = MIN_K; k <= MAX_K; ++k) {
330
332
const int32 possible_kmer_cnt = 1 << (2 * k);
331
-
int32 reg_len = k * SIZE_FACTOR;
333
+
int32 reg_len = k * size_factor;
332
334
333
335
//exploring sequence to the right
334
-
for (int32 shift = 0; shift < k; ++shift) {
336
+
//fprintf(stderr, "checking upstream k=%d, init_shift=%d\n", k, std::max(-k, -offset));
337
+
for (int32 shift = std::max(-k, -offset); shift < k; ++shift) {
338
+
335
339
if (reg_len + shift > remaining)
336
340
break;
337
341
CollectKmerStat(seq + shift, reg_len, k, stats_buff);
338
-
if (*std::max_element(stats_buff, stats_buff + possible_kmer_cnt) >= REPEAT_NUM) {
342
+
if (*std::max_element(stats_buff, stats_buff + possible_kmer_cnt) >= repeat_cnt) {
343
+
//comment out!
339
344
//char subbuff[reg_len + 1];
340
345
//memcpy(subbuff, seq + shift, reg_len);
341
346
//subbuff[reg_len] = '\0';
@@ -345,12 +350,14 @@ CheckTrivialDNA(const char* seq, int32 remaining, int32 offset) {
345
350
}
346
351
}
347
352
353
+
//fprintf(stderr, "checking downstream k=%d, init_shift=%d\n", k, std::max(-k, -remaining));
348
354
//exploring sequence to the left
349
-
for (int32 shift = 0; shift < k; ++shift) {
355
+
for (int32 shift = std::max(-k, -remaining); shift < k; ++shift) {
350
356
if (reg_len + shift > offset)
351
357
break;
352
358
CollectKmerStat(seq - shift - 1, -reg_len, k, stats_buff);
353
-
if (*std::max_element(stats_buff, stats_buff + possible_kmer_cnt) >= REPEAT_NUM) {
359
+
if (*std::max_element(stats_buff, stats_buff + possible_kmer_cnt) >= repeat_cnt) {
360
+
//comment out!
354
361
//char subbuff[reg_len + 1];
355
362
//memcpy(subbuff, seq - shift - reg_len, reg_len);
356
363
//subbuff[reg_len] = '\0';
@@ -365,24 +372,40 @@ CheckTrivialDNA(const char* seq, int32 remaining, int32 offset) {
365
372
366
373
static
367
374
bool
368
-
CheckNonTrivialDNA(const char* a_part, const char* b_part,
375
+
CheckTrivialDNA(const char* a_part, const char* b_part,
369
376
int32 a_len, int32 b_len,
370
-
int32 a_pos, int32 b_pos) {
371
-
//fprintf(stderr, "Checking for trivial DNA in A around position %d\n", i);
372
-
if (CheckTrivialDNA(a_part + a_pos, /*remaining*/a_len - a_pos, /*offset*/a_pos))
373
-
return false;
374
-
//fprintf(stderr, "Checking for trivial DNA in B around position %d\n", j);
375
-
if (CheckTrivialDNA(b_part + b_pos, /*remaining*/b_len - b_pos, /*offset*/b_pos))
376
-
return false;
377
-
return true;
377
+
int32 a_pos, int32 b_pos,
378
+
int32 size_factor, int32 repeat_cnt) {
379
+
//fprintf(stderr, "Checking for trivial DNA in around positions %d, %d\n", a_pos, b_pos);
380
+
381
+
if (CheckTrivialDNA(a_part + a_pos, /*remaining*/a_len - a_pos, /*offset*/a_pos, size_factor, repeat_cnt)) {
382
+
//fprintf(stderr, "Trivial DNA detected in A around position %d\n", a_pos);
383
+
return true;
384
+
}
385
+
if (CheckTrivialDNA(b_part + b_pos, /*remaining*/b_len - b_pos, /*offset*/b_pos, size_factor, repeat_cnt)) {
386
+
//fprintf(stderr, "Trivial DNA detected in B around position %d\n", b_pos);
387
+
return true;
388
+
}
389
+
//fprintf(stderr, "NON-Trivial DNA!!!\n", a_pos);
390
+
return false;
378
391
}
379
392
380
393
static
381
394
std::pair<size_t, size_t>
382
395
ComputeErrors(const char* a_part, const char* b_part,
383
396
int32 delta_len, int32 *deltas,
384
397
int32 a_len, int32 b_len,
385
-
bool check_trivial_dna) {
398
+
bool check_trivial_dna,
399
+
uint32 ignore_flank) {
400
+
401
+
static const int32 MM_SIZE_FACTOR = 6;
402
+
static const int32 MM_REPEAT_NUM = 5;
403
+
404
+
//static const int32 IND_SIZE_FACTOR = MM_SIZE_FACTOR;
405
+
//static const int32 IND_REPEAT_NUM = MM_REPEAT_NUM;
406
+
static const int32 IND_SIZE_FACTOR = 4;
407
+
static const int32 IND_REPEAT_NUM = 3;
408
+
386
409
// Event counter. Each individual (1bp) mismatch/insertion/deletion is an event
387
410
int32 all_ct = 0;
388
411
// Processed event counter
@@ -394,6 +417,21 @@ ComputeErrors(const char* a_part, const char* b_part,
394
417
//position in "alignment" of a_part and b_part
395
418
int32 p = 0;
396
419
420
+
auto cnt_event_f = [&](int32 size_factor, int32 repeat_cnt) {
421
+
if (i < ignore_flank ||
422
+
j < ignore_flank ||
423
+
i + ignore_flank >= a_len ||
424
+
j + ignore_flank >= b_len) {
425
+
return false;
426
+
}
427
+
if (check_trivial_dna &&
428
+
CheckTrivialDNA(a_part, b_part, a_len, b_len, i, j,
429
+
size_factor, repeat_cnt)) {
430
+
return false;
431
+
}
432
+
return true;
433
+
};
434
+
397
435
for (int32 k=0; k < delta_len; k++) {
398
436
//fprintf(stderr, "k=%d deltalen=%d i=%d our of %d j=%d out of %d\n", k, wa->ped.deltaLen, i, a_len, j, b_len);
399
437
@@ -404,8 +442,7 @@ ComputeErrors(const char* a_part, const char* b_part,
404
442
//fprintf(stderr, "SUBST %c -> %c at %d #%d\n", a_part[i], b_part[j], i, p);
405
443
406
444
all_ct++;
407
-
if (!check_trivial_dna || CheckNonTrivialDNA(a_part, b_part, a_len, b_len, i, j))
408
-
ct++;
445
+
ct += (int32) cnt_event_f(MM_SIZE_FACTOR, MM_REPEAT_NUM);
409
446
}
410
447
411
448
i++; //assert(i <= a_len);
@@ -419,8 +456,7 @@ ComputeErrors(const char* a_part, const char* b_part,
419
456
//Insertion at i - 1 in a_part (p in "alignment")
420
457
//fprintf(stderr, "INSERT %c at %d #%d\n", b_part[j], i-1, p);
421
458
all_ct++;
422
-
if (!check_trivial_dna || CheckNonTrivialDNA(a_part, b_part, a_len, b_len, i, j))
423
-
ct++;
459
+
ct += (int32) cnt_event_f(IND_SIZE_FACTOR, IND_REPEAT_NUM);
424
460
425
461
j++; //assert(j <= b_len);
426
462
p++;
@@ -432,8 +468,7 @@ ComputeErrors(const char* a_part, const char* b_part,
432
468
//Deletion at i in a_part (p in "alignment")
433
469
//fprintf(stderr, "DELETE %c at %d #%d\n", a_part[i], i, p);
434
470
all_ct++;
435
-
if (!check_trivial_dna || CheckNonTrivialDNA(a_part, b_part, a_len, b_len, i, j))
436
-
ct++;
471
+
ct += (int32) cnt_event_f(IND_SIZE_FACTOR, IND_REPEAT_NUM);
437
472
438
473
i++; //assert(i <= a_len);
439
474
p++;
@@ -446,8 +481,7 @@ ComputeErrors(const char* a_part, const char* b_part,
446
481
//fprintf(stderr, "SUBST %c -> %c at %d #%d\n", a_part[i], b_part[j], i, p);
447
482
//Substitution at i in a_part (p in "alignment")
448
483
all_ct++;
449
-
if (!check_trivial_dna || CheckNonTrivialDNA(a_part, b_part, a_len, b_len, i, j))
450
-
ct++;
484
+
ct += (int32) cnt_event_f(MM_SIZE_FACTOR, MM_REPEAT_NUM);
451
485
}
452
486
453
487
i++; //assert(i <= a_len); // Guaranteed, we're looping on this
@@ -459,8 +493,8 @@ ComputeErrors(const char* a_part, const char* b_part,
459
493
// fprintf(stderr, "Reported %d out of %d\n", ct, all_ct);
460
494
//}
461
495
462
-
assert(i <= a_len);
463
-
assert(j <= b_len);
496
+
assert(i == a_len);
497
+
assert(j == b_len);
464
498
465
499
return std::make_pair(ct, p);
466
500
}
@@ -519,6 +553,7 @@ ProcessAlignment(int32 a_part_len, const char *a_part, int64 a_hang, int32 b_par
519
553
*match_to_end,
520
554
ped);
521
555
556
+
522
557
//Adjusting the extremities
523
558
//TODO discuss the logic!
524
559
//TODO refactor out the code duplication
@@ -560,6 +595,7 @@ ProcessAlignment(int32 a_part_len, const char *a_part, int64 a_hang, int32 b_par
560
595
all_errors -= i;
561
596
}
562
597
598
+
//fprintf(stderr, "Showing alignment\n");
563
599
//Display_Alignment(a_part, a_end, b_part, b_end, ped->delta, ped->deltaLen);
564
600
565
601
*invalid_olap = (std::min(a_end, b_end) <= 0);
@@ -571,13 +607,13 @@ ProcessAlignment(int32 a_part_len, const char *a_part, int64 a_hang, int32 b_par
571
607
int32 events;
572
608
int32 alignment_len;
573
609
610
+
//fprintf(stderr, "Computing errors\n");
611
+
static const uint32 FLANK_IGNORE = 5;
574
612
//fprintf(stderr, "Checking for trivial DNA regions: %d\n", check_trivial_dna);
575
613
std::tie(events, alignment_len) = ComputeErrors(a_part, b_part, ped->deltaLen, ped->delta,
576
-
a_end, b_end, check_trivial_dna);
614
+
a_end, b_end, check_trivial_dna, FLANK_IGNORE);
577
615
578
-
if (!check_trivial_dna && all_errors != events) {
579
-
fprintf(stderr, "Old errors %d new events %d\n", all_errors, events);
580
-
}
616
+
//fprintf(stderr, "Old errors %d new events %d\n", all_errors, events);
581
617
assert(check_trivial_dna || all_errors == events);
582
618
583
619
assert(events >= 0 && alignment_len > 0);
@@ -661,8 +697,8 @@ Redo_Olaps(coParameters *G, /*const*/ sqStore *seqStore) {
661
697
for (; thisOvl <= lastOvl && G->olaps[thisOvl].b_iid == curID; thisOvl++) {
662
698
const Olap_Info_t &olap = G->olaps[thisOvl];
663
699
664
-
//if (olap.b_iid != 39861)
665
-
// continue;
700
+
if (G->secondID != UINT32_MAX && olap.b_iid != G->secondID)
701
+
continue;
666
702
667
703
if (olap.normal) {
668
704
// fprintf(stderr, "b_part = fseq %40.40s\n", fseq);
RetroSearch is an open source project built by @garambo | Open a GitHub Issue
Search and Browse the WWW like it's 1997 | Search results from DuckDuckGo
HTML:
3.2
| Encoding:
UTF-8
| Version:
0.7.4