Last active
October 30, 2018 03:20
-
-
Save lh3/8cc5817d56f0fb0be54139338be778b6 to your computer and use it in GitHub Desktop.
Position-specific gap open penalty
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <stdio.h> | |
#include <string.h> | |
#include <unistd.h> | |
#include <assert.h> | |
#include "ksw2.h" | |
unsigned char seq_nt4_table[256] = { | |
0, 1, 2, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, | |
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, | |
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, | |
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, | |
4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4, | |
4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, | |
4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4, | |
4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, | |
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, | |
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, | |
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, | |
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, | |
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, | |
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, | |
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, | |
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4 | |
}; | |
static void ksw_gen_simple_mat(int m, int8_t *mat, int8_t a, int8_t b) | |
{ | |
int i, j; | |
a = a < 0? -a : a; | |
b = b > 0? -b : b; | |
for (i = 0; i < m - 1; ++i) { | |
for (j = 0; j < m - 1; ++j) | |
mat[i * m + j] = i == j? a : b; | |
mat[i * m + m - 1] = 0; | |
} | |
for (j = 0; j < m; ++j) | |
mat[(m - 1) * m + j] = 0; | |
} | |
int main(int argc, char *argv[]) | |
{ | |
int8_t a = 6, b = 18, q = 30, e = 5; | |
int c, w = -1; | |
while ((c = getopt(argc, argv, "w:A:B:O:E:")) >= 0) { | |
if (c == 'w') w = atoi(optarg); | |
else if (c == 'A') a = atoi(optarg); | |
else if (c == 'B') b = atoi(optarg); | |
else if (c == 'O') q = atoi(optarg); | |
else if (c == 'E') e = atoi(optarg); | |
} | |
if (argc - optind < 2) { | |
fprintf(stderr, "Usage: ksw2-test [options] <DNA-target> <DNA-query>\n"); | |
fprintf(stderr, "Options:\n"); | |
fprintf(stderr, " -A INT match score [%d]\n", a); | |
fprintf(stderr, " -B INT mismatch penalty [%d]\n", b); | |
fprintf(stderr, " -O INT gap open penalty [%d]\n", q); | |
fprintf(stderr, " -E INT gap extension penalty [%d]\n", e); | |
return 1; | |
} else { | |
int tlen, qlen, i, j, k, l, score, m_cigar = 0, n_cigar = 0, blen; | |
uint32_t *cigar = 0; | |
uint8_t *tseq, *qseq; | |
int8_t mat[25], *pso = 0; | |
char *out[3]; | |
// prepare for the alignment | |
ksw_gen_simple_mat(5, mat, a, -b); | |
tlen = strlen(argv[optind]); | |
qlen = strlen(argv[optind+1]); | |
if (optind + 2 < argc) { | |
j = strlen(argv[optind+2]); | |
assert(j == tlen); | |
} | |
tseq = (uint8_t*)calloc(tlen * 2 + qlen, 1); | |
qseq = tseq + tlen; | |
pso = (int8_t*)qseq + qlen; | |
for (j = 0; j < tlen; ++j) tseq[j] = seq_nt4_table[(uint8_t)argv[optind][j]]; | |
for (j = 0; j < qlen; ++j) qseq[j] = seq_nt4_table[(uint8_t)argv[optind + 1][j]]; | |
if (optind + 2 < argc) { | |
for (j = 0; j < tlen; ++j) { | |
int c = (int)argv[optind + 2][j] - '0'; | |
assert(c >= 0 && c <= 127); | |
pso[j] = c; | |
} | |
} | |
if (w < 0) w = tlen > qlen? tlen : qlen; | |
// perform alignment | |
score = ksw_ggd(0, qlen, qseq, tlen, tseq, 5, mat, q, e, w, pso, &m_cigar, &n_cigar, &cigar); | |
// print alignment | |
for (k = blen = 0; k < n_cigar; ++k) blen += cigar[k] >> 4; | |
out[0] = (char*)calloc((blen + 1) * 3, 1); | |
out[1] = out[0] + blen + 1; | |
out[2] = out[1] + blen + 1; | |
for (k = i = j = l = 0; k < n_cigar; ++k) { | |
int t, op = cigar[k] & 0xf, len = cigar[k] >> 4; | |
if (op == 0) { | |
for (t = 0; t < len; ++t) { | |
out[0][l] = argv[optind][i + t]; | |
out[2][l] = argv[optind+1][j + t]; | |
out[1][l++] = tseq[i + t] == qseq[j + t] && tseq[i + t] < 4? '|' : ' '; | |
} | |
i += len, j += len; | |
} else if (op == 1) { | |
for (t = 0; t < len; ++t) | |
out[0][l] = '-', out[2][l] = argv[optind+1][j + t], out[1][l++] = ' '; | |
j += len; | |
} else { | |
for (t = 0; t < len; ++t) | |
out[0][l] = argv[optind][i + t], out[2][l] = '-', out[1][l++] = ' '; | |
i += len; | |
} | |
} | |
printf("Score: %d\n%s\n%s\n%s\n", score, out[0], out[1], out[2]); | |
free(tseq); | |
free(out[0]); | |
} | |
return 0; | |
} |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#ifndef KSW2_H_ | |
#define KSW2_H_ | |
#include <stdint.h> | |
#include <stdlib.h> | |
#define KSW_NEG_INF -0x40000000 | |
#ifdef __cplusplus | |
extern "C" { | |
#endif | |
/** | |
* Global alignment | |
* | |
* (first 10 parameters identical to ksw_extz_sse()) | |
* @param m_cigar (modified) max CIGAR length; feed 0 if cigar==0 | |
* @param n_cigar (out) number of CIGAR elements | |
* @param cigar (out) BAM-encoded CIGAR; caller need to deallocate with kfree(km, ) | |
* | |
* @return score of the alignment | |
*/ | |
int ksw_ggd(void *km, int qlen, const uint8_t *query, int tlen, const uint8_t *target, int8_t m, const int8_t *mat, int8_t gapo, int8_t gape, int w, | |
int8_t *pso, int *m_cigar_, int *n_cigar_, uint32_t **cigar_); | |
#ifdef __cplusplus | |
} | |
#endif | |
/************************************ | |
*** Private macros and functions *** | |
************************************/ | |
static inline uint32_t *ksw_push_cigar(int *n_cigar, int *m_cigar, uint32_t *cigar, uint32_t op, int len) | |
{ | |
if (*n_cigar == 0 || op != (cigar[(*n_cigar) - 1]&0xf)) { | |
if (*n_cigar == *m_cigar) { | |
*m_cigar = *m_cigar? (*m_cigar)<<1 : 4; | |
cigar = (uint32_t*)realloc(cigar, (*m_cigar) << 2); | |
} | |
cigar[(*n_cigar)++] = len<<4 | op; | |
} else cigar[(*n_cigar)-1] += len<<4; | |
return cigar; | |
} | |
// In the backtrack matrix, value p[] has the following structure: | |
// bit 0-2: which type gets the max - 0 for H, 1 for E, 2 for F, 3 for \tilde{E} and 4 for \tilde{F} | |
// bit 3/0x08: 1 if a continuation on the E state (bit 5/0x20 for a continuation on \tilde{E}) | |
// bit 4/0x10: 1 if a continuation on the F state (bit 6/0x40 for a continuation on \tilde{F}) | |
static inline void ksw_backtrack(int is_rot, int is_rev, int min_intron_len, const uint8_t *p, const int *off, const int *off_end, int n_col, int i0, int j0, | |
int *m_cigar_, int *n_cigar_, uint32_t **cigar_) | |
{ // p[] - lower 3 bits: which type gets the max; bit | |
int n_cigar = 0, m_cigar = *m_cigar_, i = i0, j = j0, r, state = 0; | |
uint32_t *cigar = *cigar_, tmp; | |
while (i >= 0 && j >= 0) { // at the beginning of the loop, _state_ tells us which state to check | |
int force_state = -1; | |
if (is_rot) { | |
r = i + j; | |
if (i < off[r]) force_state = 2; | |
if (off_end && i > off_end[r]) force_state = 1; | |
tmp = force_state < 0? p[(size_t)r * n_col + i - off[r]] : 0; | |
} else { | |
if (j < off[i]) force_state = 2; | |
if (off_end && j > off_end[i]) force_state = 1; | |
tmp = force_state < 0? p[(size_t)i * n_col + j - off[i]] : 0; | |
} | |
if (state == 0) state = tmp & 7; // if requesting the H state, find state one maximizes it. | |
else if (!(tmp >> (state + 2) & 1)) state = 0; // if requesting other states, _state_ stays the same if it is a continuation; otherwise, set to H | |
if (state == 0) state = tmp & 7; // TODO: probably this line can be merged into the "else if" line right above; not 100% sure | |
if (force_state >= 0) state = force_state; | |
if (state == 0) cigar = ksw_push_cigar(&n_cigar, &m_cigar, cigar, 0, 1), --i, --j; // match | |
else if (state == 1 || (state == 3 && min_intron_len <= 0)) cigar = ksw_push_cigar(&n_cigar, &m_cigar, cigar, 2, 1), --i; // deletion | |
else if (state == 3 && min_intron_len > 0) cigar = ksw_push_cigar(&n_cigar, &m_cigar, cigar, 3, 1), --i; // intron | |
else cigar = ksw_push_cigar(&n_cigar, &m_cigar, cigar, 1, 1), --j; // insertion | |
} | |
if (i >= 0) cigar = ksw_push_cigar(&n_cigar, &m_cigar, cigar, min_intron_len > 0 && i >= min_intron_len? 3 : 2, i + 1); // first deletion | |
if (j >= 0) cigar = ksw_push_cigar(&n_cigar, &m_cigar, cigar, 1, j + 1); // first insertion | |
if (!is_rev) | |
for (i = 0; i < n_cigar>>1; ++i) // reverse CIGAR | |
tmp = cigar[i], cigar[i] = cigar[n_cigar-1-i], cigar[n_cigar-1-i] = tmp; | |
*m_cigar_ = m_cigar, *n_cigar_ = n_cigar, *cigar_ = cigar; | |
} | |
#endif |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <stdio.h> // for debugging only | |
#include <assert.h> | |
#include "ksw2.h" | |
typedef struct { int32_t h, e; } eh_t; | |
int ksw_ggd(void *km, int qlen, const uint8_t *query, int tlen, const uint8_t *target, int8_t m, const int8_t *mat, int8_t gapo, int8_t gape, int w, | |
int8_t *pso, int *m_cigar_, int *n_cigar_, uint32_t **cigar_) | |
{ | |
eh_t *eh; | |
int8_t *qp; // query profile | |
int32_t i, j, k, gapoe = gapo + gape, score, n_col, *off = 0; | |
uint8_t *z = 0; // backtrack matrix; in each cell: f<<4|e<<2|h; in principle, we can halve the memory, but backtrack will be more complex | |
assert(m_cigar_ && n_cigar_ && cigar_); | |
// allocate memory | |
if (w < 0) w = tlen > qlen? tlen : qlen; | |
n_col = qlen < 2*w+1? qlen : 2*w+1; // maximum #columns of the backtrack matrix | |
qp = (int8_t*)malloc(qlen * m); | |
eh = (eh_t*)calloc(qlen + 1, 8); | |
if (m_cigar_ && n_cigar_ && cigar_) { | |
*n_cigar_ = 0; | |
z = (uint8_t*)malloc((size_t)n_col * tlen); | |
off = (int32_t*)calloc(tlen, 4); | |
} | |
// generate the query profile | |
for (k = i = 0; k < m; ++k) { | |
const int8_t *p = &mat[k * m]; | |
for (j = 0; j < qlen; ++j) qp[i++] = p[query[j]]; | |
} | |
// fill the first row | |
eh[0].h = 0, eh[0].e = -gapoe - gapoe; | |
for (j = 1; j <= qlen && j <= w; ++j) | |
eh[j].h = -(gapoe + gape * (j - 1)), eh[j].e = -(gapoe + gapoe + gape * j); | |
for (; j <= qlen; ++j) eh[j].h = eh[j].e = KSW_NEG_INF; // everything is -inf outside the band | |
// DP loop | |
for (i = 0; i < tlen; ++i) { // target sequence is in the outer loop | |
int32_t f = KSW_NEG_INF, h1, st, en, os = pso? -pso[i] : 0, gapoe2 = gapoe + os; | |
int8_t *q = &qp[target[i] * qlen]; | |
uint8_t *zi = &z[(long)i * n_col]; | |
st = i > w? i - w : 0; | |
en = i + w + 1 < qlen? i + w + 1 : qlen; | |
h1 = st > 0? KSW_NEG_INF : -(gapoe + gape * i); | |
f = st > 0? KSW_NEG_INF : -(gapoe + gapoe + os + gape * i); | |
off[i] = st; | |
for (j = st; j < en; ++j) { | |
// At the beginning of the loop: eh[j] = { H(i-1,j-1), E(i,j) }, f = F(i,j) and h1 = H(i,j-1) | |
// Cells are computed in the following order: | |
// H(i,j) = max{H(i-1,j-1) + S(i,j), E(i,j), F(i,j)} | |
// E(i+1,j) = max{H(i,j)-gapo, E(i,j)} - gape | |
// F(i,j+1) = max{H(i,j)-gapo, F(i,j)} - gape | |
eh_t *p = &eh[j]; | |
int32_t h = p->h, e = p->e; | |
uint8_t d; // direction | |
p->h = h1; | |
h += q[j]; | |
d = h >= e? 0 : 1; | |
h = h >= e? h : e; | |
d = h >= f? d : 2; | |
h = h >= f? h : f; | |
h1 = h; | |
h -= gapoe2; | |
e -= gape; | |
d |= e > h? 0x08 : 0; | |
e = e > h? e : h; | |
p->e = e; | |
f -= gape; | |
d |= f > h? 0x10 : 0; // if we want to halve the memory, use one bit only, instead of two | |
f = f > h? f : h; | |
zi[j - st] = d; // z[i,j] keeps h for the current cell and e/f for the next cell | |
} | |
eh[en].h = h1, eh[en].e = KSW_NEG_INF; | |
} | |
// backtrack | |
score = eh[qlen].h; | |
free(qp); free(eh); | |
if (m_cigar_ && n_cigar_ && cigar_) { | |
ksw_backtrack(0, 0, 0, z, off, 0, n_col, tlen-1, qlen-1, m_cigar_, n_cigar_, cigar_); | |
free(z); free(off); | |
} | |
return score; | |
} |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
gg:ksw2_ggd.c cli.c ksw2.h | |
$(CC) -Wall -g -O2 -o $@ ksw2_ggd.c cli.c | |
clean: | |
rm -fr *.o *.dSYM gg |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
How to run:
Output: