// Based on https://github.com/mathertel/Diff
// "An O(ND) Difference Algorithm and its Variations" by Eugene Myers Algorithmica Vol. 1 No. 2, 1986, p 251.

use std::collections::HashMap;

/// Line-level view of one of the two texts being compared.
#[derive(Debug)]
struct DiffData {
    /// Number of lines in the text (always equal to `codes.len()`).
    length: usize,
    /// One integer per line; equal lines (in either text) share the same code.
    codes: Vec<usize>,
    /// `modified[i]` is set when line `i` is deleted (text A) or inserted (text B).
    /// Allocated with `length + 2` entries, so it has two spare slots past the last line.
    modified: Vec<bool>,
}

/// Compares `a` and `b` line by line.
///
/// The per-line change flags are computed, then handed to [`optimize`] and
/// [`create_diffs`]. Both of those are still placeholders, so this function
/// currently produces no output.
pub fn diff(a: &str, b: &str) {
    let (data_a, data_b) = mark_modified(a, b);

    optimize(&data_a);
    optimize(&data_b);

    create_diffs(&data_a, &data_b)
}

/// Builds the [`DiffData`] for both texts and runs the LCS search, returning
/// the two data sets with their `modified` flags filled in.
fn mark_modified(a: &str, b: &str) -> (DiffData, DiffData) {
    // Shared between both texts so that equal lines receive equal codes.
    let mut existing_hashes: HashMap<&str, usize> = HashMap::new();
    let mut data_a = diff_data(a, &mut existing_hashes);
    let mut data_b = diff_data(b, &mut existing_hashes);

    // `sms` offsets every diagonal by `len_a + len_b + 1`, so the work vectors
    // must be sized from that same value.
    let max = data_a.length + data_b.length + 1;
    let mut down_vector = vec![0isize; 2 * max + 2];
    let mut up_vector = vec![0isize; 2 * max + 2];

    let upper_a = data_a.length;
    let upper_b = data_b.length;

    lcs(
        &mut data_a,
        0,
        upper_a,
        &mut data_b,
        0,
        upper_b,
        &mut down_vector,
        &mut up_vector,
    );

    (data_a, data_b)
}

/// Splits `text` into lines and wraps the resulting codes in a [`DiffData`]
/// whose `modified` flags are all `false`.
fn diff_data<'a>(text: &'a str, existing_hashes: &mut HashMap<&'a str, usize>) -> DiffData {
    let codes = diff_codes(text, existing_hashes);
    let length = codes.len();

    DiffData {
        length,
        codes,
        modified: vec![false; length + 2],
    }
}

/// Maps every line of `text` (split on `'\n'`) to an integer code.
///
/// A line already present in `existing_hashes` reuses its code; a new line is
/// registered with the next free code (`existing_hashes.len() + 1`).
fn diff_codes<'a>(text: &'a str, existing_hashes: &mut HashMap<&'a str, usize>) -> Vec<usize> {
    text.split('\n')
        .map(|line| {
            let next_code = existing_hashes.len() + 1;
            *existing_hashes.entry(line).or_insert(next_code)
        })
        .collect()
}

/// Converts a signed diff coordinate or vector index back to `usize`.
///
/// Panics if `value` is negative, which would mean a broken invariant in `sms`.
fn to_index(value: isize) -> usize {
    assert!(value >= 0, "diff coordinate must be non-negative, got {}", value);
    value as usize
}

/// Longest Common-Subsequence.
///
/// Recursively compares `data_a[lower_a..upper_a]` with
/// `data_b[lower_b..upper_b]` and sets the `modified` flag of every line that
/// is not part of the common subsequence. `down_vector` and `up_vector` are
/// scratch space for [`sms`] and must each hold at least
/// `2 * (len_a + len_b + 1) + 2` entries.
fn lcs(
    data_a: &mut DiffData,
    mut lower_a: usize,
    mut upper_a: usize,
    data_b: &mut DiffData,
    mut lower_b: usize,
    mut upper_b: usize,
    down_vector: &mut [isize],
    up_vector: &mut [isize],
) {
    // Skip the common prefix.
    while lower_a < upper_a && lower_b < upper_b && data_a.codes[lower_a] == data_b.codes[lower_b] {
        lower_a += 1;
        lower_b += 1;
    }

    // Skip the common suffix.
    while lower_a < upper_a
        && lower_b < upper_b
        && data_a.codes[upper_a - 1] == data_b.codes[upper_b - 1]
    {
        upper_a -= 1;
        upper_b -= 1;
    }

    if lower_a == upper_a {
        // Inserted lines: everything left in B, starting at `lower_b` itself.
        for flag in data_b.modified.iter_mut().take(upper_b).skip(lower_b) {
            *flag = true;
        }
    } else if lower_b == upper_b {
        // Deleted lines: everything left in A, starting at `lower_a` itself.
        for flag in data_a.modified.iter_mut().take(upper_a).skip(lower_a) {
            *flag = true;
        }
    } else {
        // Find the middle snake of an optimal path for the current sub-ranges.
        let (x, y) = sms(
            data_a,
            lower_a,
            upper_a,
            data_b,
            lower_b,
            upper_b,
            down_vector,
            up_vector,
        );

        // The path runs from (lower_a, lower_b) to (x, y) and from (x, y) to (upper_a, upper_b).
        lcs(data_a, lower_a, x, data_b, lower_b, y, down_vector, up_vector);
        lcs(data_a, x, upper_a, data_b, y, upper_b, down_vector, up_vector);
    }
}

/// Shortest Middle Snake.
///
/// Searches forward from `(lower_a, lower_b)` and backward from
/// `(upper_a, upper_b)` at the same time and returns the point `(x, y)` where
/// the two furthest-reaching paths first overlap.
///
/// # Panics
///
/// Panics if no overlap is found within the maximum edit distance, or if the
/// work vectors are too small for the two texts.
fn sms(
    data_a: &DiffData,
    lower_a: usize,
    upper_a: usize,
    data_b: &DiffData,
    lower_b: usize,
    upper_b: usize,
    down_vector: &mut [isize],
    up_vector: &mut [isize],
) -> (usize, usize) {
    // Diagonals (k = x - y) and reverse-path coordinates can be negative, so
    // all path arithmetic is signed. Line counts come from `Vec` lengths,
    // which never exceed `isize::MAX`.
    let lower_a = lower_a as isize;
    let upper_a = upper_a as isize;
    let lower_b = lower_b as isize;
    let upper_b = upper_b as isize;

    let max = (data_a.length + data_b.length + 1) as isize;

    // Diagonal on which the forward search starts / the reverse search starts.
    let down_k = lower_a - lower_b;
    let up_k = upper_a - upper_b;

    let delta = (upper_a - lower_a) - (upper_b - lower_b);
    let odd_delta = delta & 1 != 0;

    // Offsets that shift every reachable diagonal to a non-negative vector index.
    let down_offset = max - down_k;
    let up_offset = max - up_k;

    let max_d = (upper_a - lower_a + upper_b - lower_b) / 2 + 1;

    down_vector[to_index(down_offset + down_k + 1)] = lower_a;
    up_vector[to_index(up_offset + up_k - 1)] = upper_a;

    for d in 0..=max_d {
        // Extend the forward path
        for k in ((down_k - d)..=(down_k + d)).step_by(2) {
            let mut x = if k == down_k - d {
                // Down
                down_vector[to_index(down_offset + k + 1)]
            } else {
                // Right: one step past the end of the path on diagonal k - 1 ...
                let right = down_vector[to_index(down_offset + k - 1)] + 1;
                if k < down_k + d && down_vector[to_index(down_offset + k + 1)] >= right {
                    // ... unless coming down from diagonal k + 1 reaches at least as far.
                    down_vector[to_index(down_offset + k + 1)]
                } else {
                    right
                }
            };
            let mut y = x - k;

            // Find the end of the furthest reaching forward D-path in diagonal k.
            while x < upper_a
                && y < upper_b
                && data_a.codes[to_index(x)] == data_b.codes[to_index(y)]
            {
                x += 1;
                y += 1;
            }

            down_vector[to_index(down_offset + k)] = x;

            // Overlap ?
            if odd_delta
                && up_k - d < k
                && k < up_k + d
                && up_vector[to_index(up_offset + k)] <= x
            {
                return (to_index(x), to_index(x - k));
            }
        }

        // Extend the reverse path
        for k in ((up_k - d)..=(up_k + d)).step_by(2) {
            let mut x = if k == up_k + d {
                // Up
                up_vector[to_index(up_offset + k - 1)]
            } else {
                // Left
                let left = up_vector[to_index(up_offset + k + 1)] - 1;
                if k > up_k - d && up_vector[to_index(up_offset + k - 1)] < left {
                    // Up
                    up_vector[to_index(up_offset + k - 1)]
                } else {
                    left
                }
            };
            let mut y = x - k;

            // Follow the snake backwards while the preceding lines match.
            while x > lower_a
                && y > lower_b
                && data_a.codes[to_index(x - 1)] == data_b.codes[to_index(y - 1)]
            {
                x -= 1;
                y -= 1;
            }

            up_vector[to_index(up_offset + k)] = x;

            // Overlap ?
            if !odd_delta && down_k - d <= k && k <= down_k + d {
                let forward_x = down_vector[to_index(down_offset + k)];
                if x <= forward_x {
                    return (to_index(forward_x), to_index(forward_x - k));
                }
            }
        }
    }

    panic!("This should not be possible :(");
}

/// Placeholder: currently a no-op.
// TODO: not implemented yet.
fn optimize(_data: &DiffData) {}

/// Placeholder: currently a no-op.
// TODO: not implemented yet.
fn create_diffs(_data_a: &DiffData, _data_b: &DiffData) {}

// Entry point taken from the `src/main.rs` part of the patch. There the file
// declares `mod matrix; mod lcs; mod diff;` and imports `crate::diff::diff`.
fn main() {
    let a = "abcabba\nlkajsdfasdf\nasdfasdfasdf\nlasjkdf";
    let b = "abcabba\ncbabasdfasdf\nlasjkdf";

    diff(a, b);
}