non-working optimized lcs implementation
This commit is contained in:
parent
d793297ad5
commit
7cc7d3bb76
|
@ -0,0 +1,195 @@
|
|||
// Based on https://github.com/mathertel/Diff
|
||||
// "An O(ND) Difference Algorithm and its Variations" by Eugene Myers Algorithmica Vol. 1 No. 2, 1986, p 251.
|
||||
|
||||
use std::collections::HashMap;
|
||||
|
||||
/// Per-text working data for the line diff.
///
/// Each line of the text is represented by a numeric code, so two lines are
/// equal exactly when their codes are equal.
#[derive(Debug, Clone, PartialEq, Eq)]
struct DiffData {
    /// Number of lines in the text (equals `codes.len()` when built by `diff_data`).
    length: usize,
    /// One code per line, in text order.
    codes: Vec<usize>,
    /// `modified[i]` is set when line `i` was inserted or deleted.
    /// `diff_data` allocates `length + 2` flags.
    modified: Vec<bool>,
}
|
||||
|
||||
pub fn diff(a: &str, b: &str) {
|
||||
let mut existing_hashes: HashMap<&str, usize> = HashMap::new();
|
||||
let mut data_a = diff_data(a, &mut existing_hashes);
|
||||
let mut data_b = diff_data(b, &mut existing_hashes);
|
||||
|
||||
let max = data_a.length + data_b.length;
|
||||
let mut down_vector = vec![0usize; 2 * max + 2];
|
||||
let mut up_vector = vec![0usize; 2 * max + 2];
|
||||
|
||||
let upper_a = data_a.length;
|
||||
let upper_b = data_b.length;
|
||||
|
||||
lcs(&mut data_a, 0, upper_a, &mut data_b, 0, upper_b, &mut down_vector, &mut up_vector);
|
||||
|
||||
optimize(&data_a);
|
||||
optimize(&data_b);
|
||||
|
||||
create_diffs(&data_a, &data_b)
|
||||
}
|
||||
|
||||
fn diff_data<'a>(text: &'a str, existing_hashes: &mut HashMap<&'a str, usize>) -> DiffData {
|
||||
let codes = diff_codes(text, existing_hashes);
|
||||
let length = codes.len();
|
||||
|
||||
DiffData {
|
||||
length,
|
||||
codes,
|
||||
modified: vec![false; length + 2],
|
||||
}
|
||||
}
|
||||
|
||||
/// Translates `text` into one numeric code per line.
///
/// The text is split on `'\n'` (an empty text therefore yields a single empty
/// line). A line already present in `existing_hashes` reuses its code; a new
/// line is inserted with the next free code, which is the table size plus one,
/// so codes start at 1.
///
/// Returns the codes in line order.
fn diff_codes<'a>(text: &'a str, existing_hashes: &mut HashMap<&'a str, usize>) -> Vec<usize> {
    text.split('\n')
        .map(|line| {
            // The table grows by exactly one entry per new line, so the next
            // free code is always `len() + 1`.
            let next_code = existing_hashes.len() + 1;
            *existing_hashes.entry(line).or_insert(next_code)
        })
        .collect()
}
|
||||
|
||||
// Longest Common-Subsequence
|
||||
fn lcs(data_a: &mut DiffData, mut lower_a: usize, mut upper_a: usize, data_b: &mut DiffData, mut lower_b: usize, mut upper_b: usize, down_vector: &mut Vec<usize>, up_vector: &mut Vec<usize>) {
|
||||
while lower_a < upper_a && lower_b < upper_b && data_a.codes[lower_a] == data_b.codes[lower_b] {
|
||||
lower_a += 1;
|
||||
lower_b += 1;
|
||||
}
|
||||
|
||||
while lower_a < upper_a && lower_b < upper_b && data_a.codes[upper_a - 1] == data_b.codes[upper_b - 1] {
|
||||
upper_a -= 1;
|
||||
upper_b -= 1;
|
||||
}
|
||||
|
||||
if lower_a == upper_a {
|
||||
// Inserted lines
|
||||
while lower_b < upper_b {
|
||||
lower_b += 1;
|
||||
data_b.modified[lower_b] = true;
|
||||
}
|
||||
} else if lower_b == upper_b {
|
||||
// Deleted lines
|
||||
while lower_a < upper_a {
|
||||
lower_a += 1;
|
||||
data_a.modified[lower_a] = true;
|
||||
}
|
||||
} else {
|
||||
// Find the middle snake and length of an optimal path for A and B
|
||||
let sms = sms(&data_a, &data_b, down_vector, up_vector);
|
||||
|
||||
// The path is from lower_x to (x, y) and (x, y) to upper_x
|
||||
lcs(data_a, lower_a, sms.0, data_b, lower_b, sms.1, down_vector, up_vector);
|
||||
lcs(data_a, sms.1, upper_a, data_b, sms.1, upper_b, down_vector, up_vector);
|
||||
}
|
||||
}
|
||||
|
||||
// Shortest Middle Snake
|
||||
fn sms(data_a: &DiffData, data_b: &DiffData, down_vector: &mut Vec<usize>, up_vector: &mut Vec<usize>) -> (usize, usize) {
|
||||
let lower_a = 0usize;
|
||||
let upper_a = data_a.length;
|
||||
let lower_b = 0usize;
|
||||
let upper_b = data_b.length;
|
||||
|
||||
let mut ret = (0usize, 0usize);
|
||||
let max = data_a.length - data_b.length - 1;
|
||||
|
||||
let down_k = lower_a - lower_b;
|
||||
let up_k = upper_a - upper_b;
|
||||
|
||||
let delta = (upper_a - lower_a) - (upper_b - lower_b);
|
||||
let odd_delta = (delta & 1) != 0;
|
||||
|
||||
let down_offset = max - down_k;
|
||||
let up_offset = max - up_k;
|
||||
|
||||
let max_d = ((upper_a - lower_a + upper_b - lower_b) / 2) + 1;
|
||||
|
||||
down_vector[down_offset + down_k + 1] = lower_a;
|
||||
up_vector[up_offset + up_k - 1] = upper_a;
|
||||
|
||||
for d in 0..=max_d {
|
||||
// Extend the forward path
|
||||
for k in ((down_k - d)..=(down_k + d)).step_by(2) {
|
||||
let mut x = 0;
|
||||
let mut y = 0;
|
||||
if k == down_k - d {
|
||||
// Down
|
||||
x = down_vector[down_offset + k + 1];
|
||||
} else {
|
||||
// Right
|
||||
x = down_vector[down_offset + k - 1];
|
||||
if k < down_k + d && down_vector[down_offset + k + 1] >= 1 {
|
||||
// Down
|
||||
x = down_vector[down_offset + k + 1];
|
||||
}
|
||||
}
|
||||
y = x - k;
|
||||
|
||||
// Find the end of the furthest reaching forward D-path in diagonal k.
|
||||
while x < upper_a && y < upper_b && data_a.codes[x] == data_b.codes[y] {
|
||||
x += 1;
|
||||
y += 1;
|
||||
}
|
||||
|
||||
down_vector[down_offset + k] = x;
|
||||
|
||||
// Overlap ?
|
||||
if odd_delta && up_k - d < k && k < up_k + d && up_vector[up_offset + k] <= down_vector[down_offset + k] {
|
||||
ret.0 = down_vector[down_offset + k];
|
||||
ret.1 = down_vector[down_offset + k] - k;
|
||||
return ret;
|
||||
}
|
||||
}
|
||||
|
||||
// Extend the reverse path
|
||||
for k in ((up_k - d)..=(up_k + d)).step_by(2) {
|
||||
let mut x = 0;
|
||||
let mut y = 0;
|
||||
|
||||
if k == up_k + d {
|
||||
// Up
|
||||
x = up_vector[up_offset + k - 1];
|
||||
} else {
|
||||
// Left
|
||||
x = up_vector[up_offset + k + 1] - 1;
|
||||
if k > up_k - d && up_vector[up_offset + k - 1] < x {
|
||||
// Up
|
||||
x = up_vector[up_offset + k - 1];
|
||||
}
|
||||
}
|
||||
y = x - k;
|
||||
|
||||
while x > lower_a && y > lower_b && data_a.codes[x - 1] == data_b.codes[y - 1] {
|
||||
x -= 1;
|
||||
y -= 1;
|
||||
}
|
||||
|
||||
up_vector[up_offset + k] = x;
|
||||
|
||||
// Overlap ?
|
||||
if !odd_delta && down_k - d <= k && k <= down_k + d && up_vector[up_offset + k] <= down_vector[down_offset + k] {
|
||||
ret.0 = down_vector[down_offset + k];
|
||||
ret.1 = down_vector[down_offset + k] - k;
|
||||
return ret;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
panic!("This should not be possible :(");
|
||||
}
|
||||
|
||||
// Placeholder for a post-processing pass over one text's diff data.
// The body is empty, so calling it currently has no effect.
fn optimize(data: &DiffData) {}
|
||||
|
||||
// Placeholder for turning the two texts' diff data into a diff result.
// The body is empty: nothing is computed, printed or returned yet.
fn create_diffs(data_a: &DiffData, data_b: &DiffData) {}
|
110
src/main.rs
110
src/main.rs
|
@ -1,60 +1,66 @@
|
|||
use crate::lcs::diff;
|
||||
use crate::diff::diff;
|
||||
|
||||
mod matrix;
|
||||
mod lcs;
|
||||
mod diff;
|
||||
|
||||
fn main() {
|
||||
let a = "abcd";
|
||||
let b = "abce";
|
||||
let a = "abcabba\nlkajsdfasdf\nasdfasdfasdf\nlasjkdf";
|
||||
let b = "abcabba\ncbabasdfasdf\nlasjkdf";
|
||||
|
||||
// diff(a, b);
|
||||
lcs(a, b);
|
||||
diff(a, b);
|
||||
// lcs(a, b);
|
||||
}
|
||||
|
||||
/// Prints the length of the shortest edit script (SES) between `a` and `b`,
/// compared character by character (greedy O(ND) search).
///
/// The search is capped at `MAX = (chars(a) + chars(b)) / 2` edits; if no
/// script of at most that length exists, a "greater than MAX" message is
/// printed instead. In both cases the endpoint vector is dumped with `dbg!`.
fn lcs(a: &str, b: &str) {
    // Collect once so characters can be indexed in O(1) and lengths are in
    // characters rather than bytes.
    let a: Vec<char> = a.chars().collect();
    let b: Vec<char> = b.chars().collect();
    let n = a.len();
    let m = b.len();
    let max = (n + m) / 2;

    // v holds the furthest x reached on each diagonal k in -max-1..=max+1;
    // `slot` shifts k so that it can be used as an index.
    let mut v = vec![0usize; 2 * max + 3];
    let slot = |k: isize| (k + max as isize + 1) as usize;

    for d in 0..=max {
        let span = d as isize;
        for k in (-span..=span).step_by(2) {
            // Start from the better of the two neighbouring diagonals.
            let mut x = if k == -span || (k != span && v[slot(k - 1)] < v[slot(k + 1)]) {
                v[slot(k + 1)]
            } else {
                v[slot(k - 1)] + 1
            };
            // On diagonal k, y = x - k; this is never negative for a path
            // that starts at (0, 0).
            let mut y = (x as isize - k) as usize;

            // Follow the diagonal while the characters match.
            while x < n && y < m && a[x] == b[y] {
                x += 1;
                y += 1;
            }

            v[slot(k)] = x;
            if x >= n && y >= m {
                println!("Length of a SES is D ({d})");
                dbg!(v);
                return;
            }
        }
    }

    println!("Length of a SES is greater than MAX ({max})");
    dbg!(v);
}
|
||||
// fn lcs(a: &str, b: &str) {
|
||||
// let n = a.len() as i32;
|
||||
// let m = b.len() as i32;
|
||||
// let max = n + m;
|
||||
// let mut endpoints = vec![0i32; max as usize * 2];
|
||||
//
|
||||
// for script_length in 0..max {
|
||||
// let mut k = -script_length;
|
||||
// while k <= script_length * 2 {
|
||||
// let index = (k + max) as usize + 1;
|
||||
// let previous_endpoint = endpoints[index - 1];
|
||||
// let next_endpoint = endpoints[index + 1];
|
||||
//
|
||||
// let mut x = if k == -script_length || k != script_length && previous_endpoint < next_endpoint {
|
||||
// next_endpoint
|
||||
// } else {
|
||||
// previous_endpoint + 1
|
||||
// };
|
||||
//
|
||||
// let mut y = if k < x {
|
||||
// x - k
|
||||
// } else {
|
||||
// 0
|
||||
// };
|
||||
//
|
||||
// // Increase x and y as long as we are in a common sequence between a and b
|
||||
// while x < n && y < m {
|
||||
// let ac = a.chars().nth(x as usize).unwrap();
|
||||
// let bc = b.chars().nth(y as usize).unwrap();
|
||||
//
|
||||
// if ac != bc {
|
||||
// break;
|
||||
// }
|
||||
//
|
||||
// x += 1;
|
||||
// y += 1;
|
||||
// }
|
||||
//
|
||||
// endpoints[index] = x;
|
||||
//
|
||||
// // We have traveled through both strings, the length of the shortest edit script (SES) has been found.
|
||||
// if x >= n && y >= m {
|
||||
// println!("Length of a SES is D ({d})");
|
||||
// return;
|
||||
// }
|
||||
//
|
||||
// k += 2;
|
||||
// }
|
||||
// }
|
||||
//
|
||||
// println!("Length of a SES is greater than MAX ({max})");
|
||||
// }
|
||||
|
|
Loading…
Reference in New Issue