From 93d460ae5d68d492c306d392276c7ba5f0636e29 Mon Sep 17 00:00:00 2001 From: william Date: Sun, 18 Jun 2023 21:52:49 -0400 Subject: [PATCH] Working LCS/SMS :) --- src/diff.rs | 205 +++++++++++++++++++++++++++++++++++++++------------- src/main.rs | 59 +-------------- 2 files changed, 158 insertions(+), 106 deletions(-) diff --git a/src/diff.rs b/src/diff.rs index e826378..e01fcaf 100644 --- a/src/diff.rs +++ b/src/diff.rs @@ -3,28 +3,71 @@ use std::collections::HashMap; +#[derive(Debug)] +pub struct DiffItem { + start_a: usize, + start_b: usize, + deleted_a: usize, + inserted_b: usize, +} + struct DiffData { length: usize, codes: Vec, modified: Vec, } -pub fn diff(a: &str, b: &str) { +struct SmsData { + x: usize, + y: usize, +} + +struct DiffVec { + data: Vec, +} + +impl DiffVec { + fn get(&self, index: usize) -> usize { + self.data[index] + } + + fn get_i32(&self, index: i32) -> usize { + if index < 0 { + panic!("Got index < 0"); + } + + self.get(index as usize) + } + + fn set(&mut self, index: usize, val: usize) { + self.data[index] = val; + } + + fn set_i32(&mut self, index: i32, val: usize) { + if index < 0 { + panic!("Got index < 0"); + } + + self.set(index as usize, val); + } +} + +pub fn diff(a: &str, b: &str) -> Vec { let mut existing_hashes: HashMap<&str, usize> = HashMap::new(); let mut data_a = diff_data(a, &mut existing_hashes); let mut data_b = diff_data(b, &mut existing_hashes); let max = data_a.length + data_b.length; - let mut down_vector = vec![0usize; 2 * max + 2]; - let mut up_vector = vec![0usize; 2 * max + 2]; + let mut down_vector = DiffVec { data: vec![0usize; 2 * max + 2] }; + let mut up_vector = DiffVec { data: vec![0usize; 2 * max + 2] }; let upper_a = data_a.length; let upper_b = data_b.length; lcs(&mut data_a, 0, upper_a, &mut data_b, 0, upper_b, &mut down_vector, &mut up_vector); - optimize(&data_a); - optimize(&data_b); + optimize(&mut data_a); + optimize(&mut data_b); create_diffs(&data_a, &data_b) } @@ -62,7 +105,7 @@ fn diff_codes<'a>(text: &'a str, existing_hashes: &mut HashMap<&'a str, usize>) } // Longest Common-Subsequence -fn lcs(data_a: &mut DiffData, mut lower_a: usize, mut upper_a: usize, data_b: &mut DiffData, mut lower_b: usize, mut upper_b: usize, down_vector: &mut Vec, up_vector: &mut Vec) { +fn lcs(data_a: &mut DiffData, mut lower_a: usize, mut upper_a: usize, data_b: &mut DiffData, mut lower_b: usize, mut upper_b: usize, down_vector: &mut DiffVec, up_vector: &mut DiffVec) { while lower_a < upper_a && lower_b < upper_b && data_a.codes[lower_a] == data_b.codes[lower_b] { lower_a += 1; lower_b += 1; @@ -76,37 +119,40 @@ fn lcs(data_a: &mut DiffData, mut lower_a: usize, mut upper_a: usize, data_b: &m if lower_a == upper_a { // Inserted lines while lower_b < upper_b { - lower_b += 1; data_b.modified[lower_b] = true; + lower_b += 1; } } else if lower_b == upper_b { // Deleted lines while lower_a < upper_a { - lower_a += 1; data_a.modified[lower_a] = true; + lower_a += 1; } } else { // Find the middle snake and length of an optimal path for A and B - let sms = sms(&data_a, &data_b, down_vector, up_vector); + let sms = sms(&data_a, lower_a, upper_a, &data_b, lower_b, upper_b, down_vector, up_vector); // The path is from lower_x to (x, y) and (x, y) to upper_x - lcs(data_a, lower_a, sms.0, data_b, lower_b, sms.1, down_vector, up_vector); - lcs(data_a, sms.1, upper_a, data_b, sms.1, upper_b, down_vector, up_vector); + lcs(data_a, lower_a, sms.x, data_b, lower_b, sms.y, down_vector, up_vector); + lcs(data_a, sms.x, upper_a, data_b, sms.y, upper_b, down_vector, up_vector); + } +} + +// https://stackoverflow.com/questions/54035728/how-to-add-a-negative-i32-number-to-an-usize-variable +fn add_i32(index: usize, offset: i32) -> usize { + if offset.is_negative() { + index - offset.wrapping_abs() as u32 as usize + } else { + index + offset as usize } } // Shortest Middle Snake -fn sms(data_a: &DiffData, data_b: &DiffData, down_vector: &mut Vec, up_vector: &mut Vec) -> (usize, usize) { - let lower_a = 0usize; - let upper_a = data_a.length; - let lower_b = 0usize; - let upper_b = data_b.length; +fn sms(data_a: &DiffData, lower_a: usize, upper_a: usize, data_b: &DiffData, lower_b: usize, upper_b: usize, down_vector: &mut DiffVec, up_vector: &mut DiffVec) -> SmsData { + let max = upper_a as i32 + upper_b as i32 + 1; - let mut ret = (0usize, 0usize); - let max = data_a.length - data_b.length - 1; - - let down_k = lower_a - lower_b; - let up_k = upper_a - upper_b; + let down_k = lower_a as i32 - lower_b as i32; + let up_k = upper_a as i32 - upper_b as i32; let delta = (upper_a - lower_a) - (upper_b - lower_b); let odd_delta = (delta & 1) != 0; @@ -114,28 +160,28 @@ fn sms(data_a: &DiffData, data_b: &DiffData, down_vector: &mut Vec, up_ve let down_offset = max - down_k; let up_offset = max - up_k; - let max_d = ((upper_a - lower_a + upper_b - lower_b) / 2) + 1; + let max_d = ((upper_a - lower_a + upper_b - lower_b) / 2) as i32 + 1; - down_vector[down_offset + down_k + 1] = lower_a; - up_vector[up_offset + up_k - 1] = upper_a; + down_vector.set_i32(down_offset + down_k + 1, lower_a); + up_vector.set_i32(up_offset + up_k - 1, upper_a); for d in 0..=max_d { // Extend the forward path for k in ((down_k - d)..=(down_k + d)).step_by(2) { - let mut x = 0; - let mut y = 0; + let mut x; + let mut y; if k == down_k - d { // Down - x = down_vector[down_offset + k + 1]; + x = down_vector.get_i32(down_offset + k + 1); } else { // Right - x = down_vector[down_offset + k - 1]; - if k < down_k + d && down_vector[down_offset + k + 1] >= 1 { + x = down_vector.get_i32(down_offset + k - 1) + 1; + if k < down_k + d && down_vector.get_i32(down_offset + k + 1) >= x { // Down - x = down_vector[down_offset + k + 1]; + x = down_vector.get_i32(down_offset + k + 1); } } - y = x - k; + y = add_i32(x, -k); // Find the end of the furthest reaching forward D-path in diagonal k. while x < upper_a && y < upper_b && data_a.codes[x] == data_b.codes[y] { @@ -143,46 +189,46 @@ fn sms(data_a: &DiffData, data_b: &DiffData, down_vector: &mut Vec, up_ve y += 1; } - down_vector[down_offset + k] = x; + down_vector.set_i32(down_offset + k, x); // Overlap ? - if odd_delta && up_k - d < k && k < up_k + d && up_vector[up_offset + k] <= down_vector[down_offset + k] { - ret.0 = down_vector[down_offset + k]; - ret.1 = down_vector[down_offset + k] - k; - return ret; + if odd_delta && up_k - d < k && k < up_k + d && up_vector.get_i32(up_offset + k) <= down_vector.get_i32(down_offset + k) { + let x = down_vector.get_i32(down_offset + k); + let y = add_i32(down_vector.get_i32(down_offset + k), -k); + return SmsData { x, y }; } } // Extend the reverse path for k in ((up_k - d)..=(up_k + d)).step_by(2) { - let mut x = 0; - let mut y = 0; + let mut x; + let mut y; if k == up_k + d { // Up - x = up_vector[up_offset + k - 1]; + x = up_vector.get_i32(up_offset + k - 1); } else { // Left - x = up_vector[up_offset + k + 1] - 1; - if k > up_k - d && up_vector[up_offset + k - 1] < x { + x = up_vector.get_i32(up_offset + k + 1) - 1; + if k > up_k - d && up_vector.get_i32(up_offset + k - 1) < x { // Up - x = up_vector[up_offset + k - 1]; + x = up_vector.get_i32(up_offset + k - 1); } } - y = x - k; + y = add_i32(x, -k); while x > lower_a && y > lower_b && data_a.codes[x - 1] == data_b.codes[y - 1] { x -= 1; y -= 1; } - up_vector[up_offset + k] = x; + up_vector.set_i32(up_offset + k, x); // Overlap ? - if !odd_delta && down_k - d <= k && k <= down_k + d && up_vector[up_offset + k] <= down_vector[down_offset + k] { - ret.0 = down_vector[down_offset + k]; - ret.1 = down_vector[down_offset + k] - k; - return ret; + if !odd_delta && down_k - d <= k && k <= down_k + d && up_vector.get_i32(up_offset + k) <= down_vector.get_i32(down_offset + k) { + let x = down_vector.get_i32(down_offset + k); + let y = add_i32(down_vector.get_i32(down_offset + k), -k); + return SmsData { x, y }; } } } @@ -190,6 +236,65 @@ fn sms(data_a: &DiffData, data_b: &DiffData, down_vector: &mut Vec, up_ve panic!("This should not be possible :("); } -fn optimize(data: &DiffData) {} +fn optimize(data: &mut DiffData) { + let mut start_pos = 0usize; + let mut end_pos; -fn create_diffs(data_a: &DiffData, data_b: &DiffData) {} \ No newline at end of file + while start_pos < data.length { + while start_pos < data.length && !data.modified[start_pos] { + start_pos += 1; + } + end_pos = start_pos; + while end_pos < data.length && data.modified[end_pos] { + end_pos += 1; + } + + if end_pos < data.length && data.codes[start_pos] == data.codes[end_pos] { + data.modified[start_pos] = false; + data.modified[end_pos] = true; + } else { + start_pos = end_pos; + } + } +} + +fn create_diffs(data_a: &DiffData, data_b: &DiffData) -> Vec { + let mut result = Vec::new(); + + let mut start_a; + let mut start_b; + let mut line_a = 0usize; + let mut line_b = 0usize; + + while line_a < data_a.length || line_b < data_b.length { + if line_a < data_a.length && !data_a.modified[line_a] && + line_b < data_b.length && !data_b.modified[line_b] { + // Equal line + line_a += 1; + line_b += 1; + } else { + start_a = line_a; + start_b = line_b; + + while line_a < data_a.length && (line_b >= data_b.length || data_a.modified[line_a]) { + line_a += 1; + } + + while line_b < data_b.length && (line_a >= data_a.length || data_b.modified[line_b]) { + line_b += 1; + } + + if start_a < line_a || start_b < line_b { + let item = DiffItem { + start_a, + start_b, + deleted_a: line_a - start_a, + inserted_b: line_b - start_b, + }; + result.push(item); + } + } + } + + result +} diff --git a/src/main.rs b/src/main.rs index 29d526e..c410a89 100644 --- a/src/main.rs +++ b/src/main.rs @@ -6,61 +6,8 @@ mod diff; fn main() { let a = "abcabba\nlkajsdfasdf\nasdfasdfasdf\nlasjkdf"; - let b = "abcabba\ncbabasdfasdf\nlasjkdf"; + let b = "abcabba\ncbabasdfasdf\nlasjkdf\nope"; - diff(a, b); - // lcs(a, b); + let result = diff(a, b); + dbg!(result); } - -// fn lcs(a: &str, b: &str) { -// let n = a.len() as i32; -// let m = b.len() as i32; -// let max = n + m; -// let mut endpoints = vec![0i32; max as usize * 2]; -// -// for script_length in 0..max { -// let mut k = -script_length; -// while k <= script_length * 2 { -// let index = (k + max) as usize + 1; -// let previous_endpoint = endpoints[index - 1]; -// let next_endpoint = endpoints[index + 1]; -// -// let mut x = if k == -script_length || k != script_length && previous_endpoint < next_endpoint { -// next_endpoint -// } else { -// previous_endpoint + 1 -// }; -// -// let mut y = if k < x { -// x - k -// } else { -// 0 -// }; -// -// // Increase x and y as long as we are in a common sequence between a and b -// while x < n && y < m { -// let ac = a.chars().nth(x as usize).unwrap(); -// let bc = b.chars().nth(y as usize).unwrap(); -// -// if ac != bc { -// break; -// } -// -// x += 1; -// y += 1; -// } -// -// endpoints[index] = x; -// -// // We have traveled through both strings, the length of the shortest edit script (SES) has been found. -// if x >= n && y >= m { -// println!("Length of a SES is D ({d})"); -// return; -// } -// -// k += 2; -// } -// } -// -// println!("Length of a SES is greater than MAX ({max})"); -// }