Last active
November 20, 2021 19:01
-
-
Save memononen/ff90a46ff55c20fc4655952c899239ff to your computer and use it in GitHub Desktop.
O(NP) diff with backtracking
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <stdio.h> | |
#include <vector> | |
#include <algorithm> | |
// Based on "An O(NP) Sequence Comparison Algorithm" by Sun Wu, Udi Manber and Gene Myers | |
struct Edit { | |
enum Type { Insert, Delete, Common }; | |
Edit() = default; | |
Edit(Type _type, int _a, int _b, int _n) : type(_type), a(_a), b(_b), n(_n) {} | |
Type type = Insert; // type of edit | |
int a = 0; // index in string a | |
int b = 0; // index in string b | |
int n = 0; // number of characters | |
}; | |
struct Diag { | |
Diag() = default; | |
Diag(int _x, int _y, int _len, int _next) : x(_x), y(_y), len(_len), next(_next) {} | |
int x = 0; // start position | |
int y = 0; | |
int len = 0; // length | |
int next = -1; // next diagonal towards the start | |
}; | |
struct Pt { | |
Pt() = default; | |
Pt(int _y, int _diag) : y(_y), diag(_diag) {} | |
int y = 0; // furthest y | |
int diag = -1; // nearest diagonal | |
}; | |
static void snake(const int k, | |
const char* a, const char* b, const int m, const int n, | |
std::vector<Pt>& fp, const int fp0, std::vector<Diag>& diags) | |
{ | |
const Pt& pa = fp[fp0 + k-1]; | |
const Pt& pb = fp[fp0 + k+1]; | |
const bool below = (pa.y+1) > pb.y; | |
const int diag = below ? pa.diag : pb.diag; | |
int y = below ? (pa.y+1) : pb.y; | |
int x = y - k; | |
int len = 0; | |
while (x < m && y < n && a[x] == b[y]) { | |
x++; y++; len++; | |
} | |
if (len > 0) { | |
diags.push_back(Diag(x-len, y-len, len, diag)); | |
fp[fp0 + k] = Pt(y, diags.size() - 1); | |
} else { | |
fp[fp0 + k] = Pt(y, diag); | |
} | |
}; | |
std::vector<Edit> diff(const char* a, const char* b) | |
{ | |
int m = strlen(a); | |
int n = strlen(b); | |
bool reverse = false; | |
if (m > n) { | |
std::swap(a, b); | |
std::swap(m, n); | |
reverse = true; | |
} | |
std::vector<Pt> fp; | |
std::vector<Diag> diags; | |
const int delta = n - m; | |
const int size = (m+1) + (n+1) + 1; | |
const int fp0 = m+1; // zero offset for furthest point indexing, it can go negative. | |
fp.resize(size); | |
// All paths will lead to empty diagonal at zero. | |
diags.push_back(Diag(0, 0, 0, -1)); | |
std::fill(fp.begin(), fp.end(), Pt(-1,0)); | |
// Calculate common diagonal sequences | |
for (int p = 0; fp[fp0 + delta].y != n; p++) { | |
for (int k = -p; k <= delta-1; k++) | |
snake(k, a,b,m,n,fp,fp0,diags); | |
for (int k = delta+p; k >= delta+1; k--) | |
snake(k, a,b,m,n,fp,fp0,diags); | |
snake(delta, a,b,m,n,fp,fp0,diags); | |
} | |
// Backtrace shortest edit script | |
std::vector<Edit> ses; | |
Diag pdiag(m, n, 0, -1); | |
for (int d = fp[fp0 + delta].diag; d != -1; d = diags[d].next) { | |
const Diag& diag = diags[d]; | |
// The path between the diagonals is handled with inserts and deletes. | |
int ndel = pdiag.x - (diag.x+diag.len); | |
int nins = pdiag.y - (diag.y+diag.len); | |
int ncom = diag.len; | |
int x = diag.x, y = diag.y; | |
if (reverse) { | |
std::swap(ndel, nins); | |
std::swap(x, y); | |
} | |
if (nins > 0) | |
ses.push_back(Edit(Edit::Insert, x+ncom+ndel, y+ncom, nins)); | |
if (ndel > 0) | |
ses.push_back(Edit(Edit::Delete, x+ncom, y+ncom, ndel)); | |
if (ncom > 0) | |
ses.push_back(Edit(Edit::Common, x, y, ncom)); | |
pdiag = diag; | |
} | |
// Backtrace left the sequence in reverse, flip it. | |
std::reverse(ses.begin(), ses.end()); | |
return ses; | |
} | |
int main() | |
{ | |
const char* b = "Lorem ipsum dolor sit amet"; | |
const char* a = "A quick brown fox jumps over the lazy dog"; | |
std::vector<Edit> ses = diff(a, b); | |
printf("A: "); | |
for (const Edit& ed : ses) { | |
switch (ed.type) | |
{ | |
case Edit::Common: | |
for (int k = 0; k < ed.n; k++) | |
printf("%c", a[ed.a+k]); | |
break; | |
case Edit::Insert: | |
for (int k = 0; k < ed.n; k++) | |
printf(" "); | |
break; | |
case Edit::Delete: | |
printf("\u001b[31m"); | |
for (int k = 0; k < ed.n; k++) | |
printf("%c", a[ed.a+k]); | |
printf("\u001b[0m"); | |
break; | |
} | |
} | |
printf("\u001b[0m\n"); | |
printf("B: "); | |
for (const Edit& ed : ses) { | |
switch (ed.type) | |
{ | |
case Edit::Common: | |
for (int k = 0; k < ed.n; k++) | |
printf("%c", b[ed.b+k]); | |
break; | |
case Edit::Insert: | |
printf("\u001b[32m"); | |
for (int k = 0; k < ed.n; k++) | |
printf("%c", b[ed.b+k]); | |
printf("\u001b[0m"); | |
break; | |
case Edit::Delete: | |
for (int k = 0; k < ed.n; k++) | |
printf(" "); | |
break; | |
} | |
} | |
printf("\u001b[0m\n"); | |
return 0; | |
} |
Author
memononen
commented
Nov 19, 2021
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment