--!strict
--[[
Based on DiffMatchPatch by Neil Fraser.
https://github.com/google/diff-match-patch
]]
export type DiffAction = number
export type Diff = { actionType: DiffAction, value: string }
export type Diffs = { Diff }
local StringDiff = {
ActionTypes = table.freeze({
Equal = 0,
Delete = 1,
Insert = 2,
}),
}
function StringDiff.findDiffs(text1: string, text2: string): Diffs
-- Validate inputs
if type(text1) ~= "string" or type(text2) ~= "string" then
error(
string.format(
"Invalid inputs to StringDiff.findDiffs, expected strings and got (%s, %s)",
type(text1),
type(text2)
),
2
)
end
-- Shortcut if the texts are identical
if text1 == text2 then
return { { actionType = StringDiff.ActionTypes.Equal, value = text1 } }
end
-- Trim off any shared prefix and suffix
-- These are easy to detect and can be dealt with quickly without needing a complex diff
-- and later we simply add them as Equal to the start and end of the diff
local sharedPrefix, sharedSuffix
local prefixLength = StringDiff._sharedPrefix(text1, text2)
if prefixLength > 0 then
-- Store the prefix
sharedPrefix = string.sub(text1, 1, prefixLength)
-- Now trim it off
text1 = string.sub(text1, prefixLength + 1)
text2 = string.sub(text2, prefixLength + 1)
end
local suffixLength = StringDiff._sharedSuffix(text1, text2)
if suffixLength > 0 then
-- Store the suffix
sharedSuffix = string.sub(text1, -suffixLength)
-- Now trim it off
text1 = string.sub(text1, 1, -suffixLength - 1)
text2 = string.sub(text2, 1, -suffixLength - 1)
end
-- Compute the diff on the middle block where the changes lie
local diffs = StringDiff._computeDiff(text1, text2)
-- Restore the prefix and suffix
if sharedPrefix then
table.insert(diffs, 1, { actionType = StringDiff.ActionTypes.Equal, value = sharedPrefix })
end
if sharedSuffix then
table.insert(diffs, { actionType = StringDiff.ActionTypes.Equal, value = sharedSuffix })
end
-- Cleanup the diff
diffs = StringDiff._cleanupSemantic(diffs)
diffs = StringDiff._reorderAndMerge(diffs)
-- Remove any empty diffs
local cursor = 1
while cursor and diffs[cursor] do
if diffs[cursor].value == "" then
table.remove(diffs, cursor)
else
cursor += 1
end
end
return diffs
end
function StringDiff._computeDiff(text1: string, text2: string): Diffs
-- Assumes that the prefix and suffix have already been trimmed off
-- and shortcut returns have been made so these texts must be different
local text1Length, text2Length = #text1, #text2
if text1Length == 0 then
-- It's simply inserting all of text2 into text1
return { { actionType = StringDiff.ActionTypes.Insert, value = text2 } }
end
if text2Length == 0 then
-- It's simply deleting all of text1
return { { actionType = StringDiff.ActionTypes.Delete, value = text1 } }
end
local longText = if text1Length > text2Length then text1 else text2
local shortText = if text1Length > text2Length then text2 else text1
local shortTextLength = #shortText
-- Shortcut if the shorter string exists entirely inside the longer one
local indexOf = if shortTextLength == 0 then nil else string.find(longText, shortText, 1, true)
if indexOf ~= nil then
local diffs = {
{ actionType = StringDiff.ActionTypes.Insert, value = string.sub(longText, 1, indexOf - 1) },
{ actionType = StringDiff.ActionTypes.Equal, value = shortText },
{ actionType = StringDiff.ActionTypes.Insert, value = string.sub(longText, indexOf + shortTextLength) },
}
-- Swap insertions for deletions if diff is reversed
if text1Length > text2Length then
diffs[1].actionType, diffs[3].actionType = StringDiff.ActionTypes.Delete, StringDiff.ActionTypes.Delete
end
return diffs
end
if shortTextLength == 1 then
-- Single character string
-- After the previous shortcut, the character can't be an equality
return {
{ actionType = StringDiff.ActionTypes.Delete, value = text1 },
{ actionType = StringDiff.ActionTypes.Insert, value = text2 },
}
end
return StringDiff._bisect(text1, text2)
end
function StringDiff._cleanupSemantic(diffs: Diffs): Diffs
-- Reduce the number of edits by eliminating semantically trivial equalities.
local changes = false
local equalities = {} -- Stack of indices where equalities are found.
local equalitiesLength = 0 -- Keeping our own length var is faster.
local lastEquality: string? = nil
-- Always equal to diffs[equalities[equalitiesLength]].value
local pointer = 1 -- Index of current position.
-- Number of characters that changed prior to the equality.
local length_insertions1 = 0
local length_deletions1 = 0
-- Number of characters that changed after the equality.
local length_insertions2 = 0
local length_deletions2 = 0
while diffs[pointer] do
if diffs[pointer].actionType == StringDiff.ActionTypes.Equal then -- Equality found.
equalitiesLength = equalitiesLength + 1
equalities[equalitiesLength] = pointer
length_insertions1 = length_insertions2
length_deletions1 = length_deletions2
length_insertions2 = 0
length_deletions2 = 0
lastEquality = diffs[pointer].value
else -- An insertion or deletion.
if diffs[pointer].actionType == StringDiff.ActionTypes.Insert then
length_insertions2 = length_insertions2 + #diffs[pointer].value
else
length_deletions2 = length_deletions2 + #diffs[pointer].value
end
-- Eliminate an equality that is smaller or equal to the edits on both
-- sides of it.
if
lastEquality
and (#lastEquality <= math.max(length_insertions1, length_deletions1))
and (#lastEquality <= math.max(length_insertions2, length_deletions2))
then
-- Duplicate record.
table.insert(
diffs,
equalities[equalitiesLength],
{ actionType = StringDiff.ActionTypes.Delete, value = lastEquality }
)
-- Change second copy to insert.
diffs[equalities[equalitiesLength] + 1].actionType = StringDiff.ActionTypes.Insert
-- Throw away the equality we just deleted.
equalitiesLength = equalitiesLength - 1
-- Throw away the previous equality (it needs to be reevaluated).
equalitiesLength = equalitiesLength - 1
pointer = (equalitiesLength > 0) and equalities[equalitiesLength] or 0
length_insertions1, length_deletions1 = 0, 0 -- Reset the counters.
length_insertions2, length_deletions2 = 0, 0
lastEquality = nil
changes = true
end
end
pointer = pointer + 1
end
-- Normalize the diff.
if changes then
StringDiff._reorderAndMerge(diffs)
end
StringDiff._cleanupSemanticLossless(diffs)
-- Find any overlaps between deletions and insertions.
-- e.g: abcxxxxxxdef
-- -> abcxxxdef
-- e.g: xxxabcdefxxx
-- -> defxxxabc
-- Only extract an overlap if it is as big as the edit ahead or behind it.
pointer = 2
while diffs[pointer] do
if
diffs[pointer - 1].actionType == StringDiff.ActionTypes.Delete
and diffs[pointer].actionType == StringDiff.ActionTypes.Insert
then
local deletion = diffs[pointer - 1].value
local insertion = diffs[pointer].value
local overlap_length1 = StringDiff._commonOverlap(deletion, insertion)
local overlap_length2 = StringDiff._commonOverlap(insertion, deletion)
if overlap_length1 >= overlap_length2 then
if overlap_length1 >= #deletion / 2 or overlap_length1 >= #insertion / 2 then
-- Overlap found. Insert an equality and trim the surrounding edits.
table.insert(
diffs,
pointer,
{ actionType = StringDiff.ActionTypes.Equal, value = string.sub(insertion, 1, overlap_length1) }
)
diffs[pointer - 1].value = string.sub(deletion, 1, #deletion - overlap_length1)
diffs[pointer + 1].value = string.sub(insertion, overlap_length1 + 1)
pointer = pointer + 1
end
else
if overlap_length2 >= #deletion / 2 or overlap_length2 >= #insertion / 2 then
-- Reverse overlap found.
-- Insert an equality and swap and trim the surrounding edits.
table.insert(
diffs,
pointer,
{ actionType = StringDiff.ActionTypes.Equal, value = string.sub(deletion, 1, overlap_length2) }
)
diffs[pointer - 1] = {
actionType = StringDiff.ActionTypes.Insert,
value = string.sub(insertion, 1, #insertion - overlap_length2),
}
diffs[pointer + 1] = {
actionType = StringDiff.ActionTypes.Delete,
value = string.sub(deletion, overlap_length2 + 1),
}
pointer = pointer + 1
end
end
pointer = pointer + 1
end
pointer = pointer + 1
end
return diffs
end
function StringDiff._sharedPrefix(text1: string, text2: string): number
-- Uses a binary search to find the largest common prefix between the two strings
-- Performance analysis: http://neil.fraser.name/news/2007/10/09/
-- Shortcut common cases
if (#text1 == 0) or (#text2 == 0) or (string.byte(text1, 1) ~= string.byte(text2, 1)) then
return 0
end
local pointerMin = 1
local pointerMax = math.min(#text1, #text2)
local pointerMid = pointerMax
local pointerStart = 1
while pointerMin < pointerMid do
if string.sub(text1, pointerStart, pointerMid) == string.sub(text2, pointerStart, pointerMid) then
pointerMin = pointerMid
pointerStart = pointerMin
else
pointerMax = pointerMid
end
pointerMid = math.floor(pointerMin + (pointerMax - pointerMin) / 2)
end
return pointerMid
end
function StringDiff._sharedSuffix(text1: string, text2: string): number
-- Uses a binary search to find the largest common suffix between the two strings
-- Performance analysis: http://neil.fraser.name/news/2007/10/09/
-- Shortcut common cases
if (#text1 == 0) or (#text2 == 0) or (string.byte(text1, -1) ~= string.byte(text2, -1)) then
return 0
end
local pointerMin = 1
local pointerMax = math.min(#text1, #text2)
local pointerMid = pointerMax
local pointerEnd = 1
while pointerMin < pointerMid do
if string.sub(text1, -pointerMid, -pointerEnd) == string.sub(text2, -pointerMid, -pointerEnd) then
pointerMin = pointerMid
pointerEnd = pointerMin
else
pointerMax = pointerMid
end
pointerMid = math.floor(pointerMin + (pointerMax - pointerMin) / 2)
end
return pointerMid
end
function StringDiff._commonOverlap(text1: string, text2: string): number
-- Determine if the suffix of one string is the prefix of another.
-- Cache the text lengths to prevent multiple calls.
local text1_length = #text1
local text2_length = #text2
-- Eliminate the null case.
if text1_length == 0 or text2_length == 0 then
return 0
end
-- Truncate the longer string.
if text1_length > text2_length then
text1 = string.sub(text1, text1_length - text2_length + 1)
elseif text1_length < text2_length then
text2 = string.sub(text2, 1, text1_length)
end
local text_length = math.min(text1_length, text2_length)
-- Quick check for the worst case.
if text1 == text2 then
return text_length
end
-- Start by looking for a single character match
-- and increase length until no match is found.
-- Performance analysis: https://neil.fraser.name/news/2010/11/04/
local best = 0
local length = 1
while true do
local pattern = string.sub(text1, text_length - length + 1)
local found = string.find(text2, pattern, 1, true)
if found == nil then
return best
end
length = length + found - 1
if found == 1 or string.sub(text1, text_length - length + 1) == string.sub(text2, 1, length) then
best = length
length = length + 1
end
end
end
function StringDiff._cleanupSemanticScore(one: string, two: string): number
-- Given two strings, compute a score representing whether the internal
-- boundary falls on logical boundaries.
-- Scores range from 6 (best) to 0 (worst).
if (#one == 0) or (#two == 0) then
-- Edges are the best.
return 6
end
-- Each port of this function behaves slightly differently due to
-- subtle differences in each language's definition of things like
-- 'whitespace'. Since this function's purpose is largely cosmetic,
-- the choice has been made to use each language's native features
-- rather than force total conformity.
local char1 = string.sub(one, -1)
local char2 = string.sub(two, 1, 1)
local nonAlphaNumeric1 = string.match(char1, "%W")
local nonAlphaNumeric2 = string.match(char2, "%W")
local whitespace1 = nonAlphaNumeric1 and string.match(char1, "%s")
local whitespace2 = nonAlphaNumeric2 and string.match(char2, "%s")
local lineBreak1 = whitespace1 and string.match(char1, "%c")
local lineBreak2 = whitespace2 and string.match(char2, "%c")
local blankLine1 = lineBreak1 and string.match(one, "\n\r?\n$")
local blankLine2 = lineBreak2 and string.match(two, "^\r?\n\r?\n")
if blankLine1 or blankLine2 then
-- Five points for blank lines.
return 5
elseif lineBreak1 or lineBreak2 then
-- Four points for line breaks
-- DEVIATION: Prefer to start on a line break instead of end on it
return if lineBreak1 then 4 else 4.5
elseif nonAlphaNumeric1 and not whitespace1 and whitespace2 then
-- Three points for end of sentences.
return 3
elseif whitespace1 or whitespace2 then
-- Two points for whitespace.
return 2
elseif nonAlphaNumeric1 or nonAlphaNumeric2 then
-- One point for non-alphanumeric.
return 1
end
return 0
end
function StringDiff._cleanupSemanticLossless(diffs: Diffs)
-- Look for single edits surrounded on both sides by equalities
-- which can be shifted sideways to align the edit to a word boundary.
-- e.g: The cat came. -> The cat came.
local pointer = 2
-- Intentionally ignore the first and last element (don't need checking).
while diffs[pointer + 1] do
local prevDiff, nextDiff = diffs[pointer - 1], diffs[pointer + 1]
if
(prevDiff.actionType == StringDiff.ActionTypes.Equal)
and (nextDiff.actionType == StringDiff.ActionTypes.Equal)
then
-- This is a single edit surrounded by equalities.
local diff = diffs[pointer]
local equality1 = prevDiff.value
local edit = diff.value
local equality2 = nextDiff.value
-- First, shift the edit as far left as possible.
local commonOffset = StringDiff._sharedSuffix(equality1, edit)
if commonOffset > 0 then
local commonString = string.sub(edit, -commonOffset)
equality1 = string.sub(equality1, 1, -commonOffset - 1)
edit = commonString .. string.sub(edit, 1, -commonOffset - 1)
equality2 = commonString .. equality2
end
-- Second, step character by character right, looking for the best fit.
local bestEquality1 = equality1
local bestEdit = edit
local bestEquality2 = equality2
local bestScore = StringDiff._cleanupSemanticScore(equality1, edit)
+ StringDiff._cleanupSemanticScore(edit, equality2)
while string.byte(edit, 1) == string.byte(equality2, 1) do
equality1 = equality1 .. string.sub(edit, 1, 1)
edit = string.sub(edit, 2) .. string.sub(equality2, 1, 1)
equality2 = string.sub(equality2, 2)
local score = StringDiff._cleanupSemanticScore(equality1, edit)
+ StringDiff._cleanupSemanticScore(edit, equality2)
-- The > (rather than >=) encourages leading rather than trailing whitespace on edits.
-- I just think it looks better for indentation changes to start the line,
-- since then indenting several lines all have aligned diffs at the start
if score > bestScore then
bestScore = score
bestEquality1 = equality1
bestEdit = edit
bestEquality2 = equality2
end
end
if prevDiff.value ~= bestEquality1 then
-- We have an improvement, save it back to the diff.
if #bestEquality1 > 0 then
diffs[pointer - 1].value = bestEquality1
else
table.remove(diffs, pointer - 1)
pointer = pointer - 1
end
diffs[pointer].value = bestEdit
if #bestEquality2 > 0 then
diffs[pointer + 1].value = bestEquality2
else
table.remove(diffs, pointer + 1)
pointer = pointer - 1
end
end
end
pointer = pointer + 1
end
end
function StringDiff._bisect(text1: string, text2: string): Diffs
-- Find the 'middle snake' of a diff, split the problem in two
-- and return the recursively constructed diff
-- See Myers 1986 paper: An O(ND) Difference Algorithm and Its Variations
-- Cache the text lengths to prevent multiple calls
local text1Length = #text1
local text2Length = #text2
local _sub, _element
local maxD = math.ceil((text1Length + text2Length) / 2)
local vOffset = maxD
local vLength = 2 * maxD
local v1 = table.create(vLength)
local v2 = table.create(vLength)
-- Setting all elements to -1 is faster in Lua than mixing integers and nil
for x = 0, vLength - 1 do
v1[x] = -1
v2[x] = -1
end
v1[vOffset + 1] = 0
v2[vOffset + 1] = 0
local delta = text1Length - text2Length
-- If the total number of characters is odd, then
-- the front path will collide with the reverse path
local front = (delta % 2 ~= 0)
-- Offsets for start and end of k loop
-- Prevents mapping of space beyond the grid
local k1Start = 0
local k1End = 0
local k2Start = 0
local k2End = 0
for d = 0, maxD - 1 do
-- Walk the front path one step
for k1 = -d + k1Start, d - k1End, 2 do
local k1_offset = vOffset + k1
local x1
if (k1 == -d) or ((k1 ~= d) and (v1[k1_offset - 1] < v1[k1_offset + 1])) then
x1 = v1[k1_offset + 1]
else
x1 = v1[k1_offset - 1] + 1
end
local y1 = x1 - k1
while
(x1 <= text1Length)
and (y1 <= text2Length)
and (string.sub(text1, x1, x1) == string.sub(text2, y1, y1))
do
x1 = x1 + 1
y1 = y1 + 1
end
v1[k1_offset] = x1
if x1 > text1Length + 1 then
-- Ran off the right of the graph
k1End = k1End + 2
elseif y1 > text2Length + 1 then
-- Ran off the bottom of the graph
k1Start = k1Start + 2
elseif front then
local k2_offset = vOffset + delta - k1
if k2_offset >= 0 and k2_offset < vLength and v2[k2_offset] ~= -1 then
-- Mirror x2 onto top-left coordinate system
local x2 = text1Length - v2[k2_offset] + 1
if x1 > x2 then
-- Overlap detected
return StringDiff._bisectSplit(text1, text2, x1, y1)
end
end
end
end
-- Walk the reverse path one step
for k2 = -d + k2Start, d - k2End, 2 do
local k2_offset = vOffset + k2
local x2
if (k2 == -d) or ((k2 ~= d) and (v2[k2_offset - 1] < v2[k2_offset + 1])) then
x2 = v2[k2_offset + 1]
else
x2 = v2[k2_offset - 1] + 1
end
local y2 = x2 - k2
while
(x2 <= text1Length)
and (y2 <= text2Length)
and (string.sub(text1, -x2, -x2) == string.sub(text2, -y2, -y2))
do
x2 = x2 + 1
y2 = y2 + 1
end
v2[k2_offset] = x2
if x2 > text1Length + 1 then
-- Ran off the left of the graph
k2End = k2End + 2
elseif y2 > text2Length + 1 then
-- Ran off the top of the graph
k2Start = k2Start + 2
elseif not front then
local k1_offset = vOffset + delta - k2
if k1_offset >= 0 and k1_offset < vLength and v1[k1_offset] ~= -1 then
local x1 = v1[k1_offset]
local y1 = vOffset + x1 - k1_offset
-- Mirror x2 onto top-left coordinate system
x2 = text1Length - x2 + 1
if x1 > x2 then
-- Overlap detected
return StringDiff._bisectSplit(text1, text2, x1, y1)
end
end
end
end
end
-- Number of diffs equals number of characters, no commonality at all
return {
{ actionType = StringDiff.ActionTypes.Delete, value = text1 },
{ actionType = StringDiff.ActionTypes.Insert, value = text2 },
}
end
function StringDiff._bisectSplit(text1: string, text2: string, x: number, y: number): Diffs
-- Given the location of the 'middle snake',
-- split the diff in two parts and recurse
local text1a = string.sub(text1, 1, x - 1)
local text2a = string.sub(text2, 1, y - 1)
local text1b = string.sub(text1, x)
local text2b = string.sub(text2, y)
-- Compute both diffs serially
local diffs = StringDiff.findDiffs(text1a, text2a)
local diffsB = StringDiff.findDiffs(text1b, text2b)
-- Merge diffs
table.move(diffsB, 1, #diffsB, #diffs + 1, diffs)
return diffs
end
function StringDiff._reorderAndMerge(diffs: Diffs): Diffs
-- Reorder and merge like edit sections and merge equalities
-- Any edit section can move as long as it doesn't cross an equality
-- Add a dummy entry at the end
table.insert(diffs, { actionType = StringDiff.ActionTypes.Equal, value = "" })
local pointer = 1
local countDelete, countInsert = 0, 0
local textDelete, textInsert = "", ""
local commonLength
while diffs[pointer] do
local actionType = diffs[pointer].actionType
if actionType == StringDiff.ActionTypes.Insert then
countInsert = countInsert + 1
textInsert = textInsert .. diffs[pointer].value
pointer = pointer + 1
elseif actionType == StringDiff.ActionTypes.Delete then
countDelete = countDelete + 1
textDelete = textDelete .. diffs[pointer].value
pointer = pointer + 1
elseif actionType == StringDiff.ActionTypes.Equal then
-- Upon reaching an equality, check for prior redundancies
if countDelete + countInsert > 1 then
if (countDelete > 0) and (countInsert > 0) then
-- Factor out any common prefixies
commonLength = StringDiff._sharedPrefix(textInsert, textDelete)
if commonLength > 0 then
local back_pointer = pointer - countDelete - countInsert
if
(back_pointer > 1) and (diffs[back_pointer - 1].actionType == StringDiff.ActionTypes.Equal)
then
diffs[back_pointer - 1].value = diffs[back_pointer - 1].value
.. string.sub(textInsert, 1, commonLength)
else
table.insert(diffs, 1, {
actionType = StringDiff.ActionTypes.Equal,
value = string.sub(textInsert, 1, commonLength),
})
pointer = pointer + 1
end
textInsert = string.sub(textInsert, commonLength + 1)
textDelete = string.sub(textDelete, commonLength + 1)
end
-- Factor out any common suffixies
commonLength = StringDiff._sharedSuffix(textInsert, textDelete)
if commonLength ~= 0 then
diffs[pointer].value = string.sub(textInsert, -commonLength) .. diffs[pointer].value
textInsert = string.sub(textInsert, 1, -commonLength - 1)
textDelete = string.sub(textDelete, 1, -commonLength - 1)
end
end
-- Delete the offending records and add the merged ones
pointer = pointer - countDelete - countInsert
for _ = 1, countDelete + countInsert do
table.remove(diffs, pointer)
end
if #textDelete > 0 then
table.insert(diffs, pointer, { actionType = StringDiff.ActionTypes.Delete, value = textDelete })
pointer = pointer + 1
end
if #textInsert > 0 then
table.insert(diffs, pointer, { actionType = StringDiff.ActionTypes.Insert, value = textInsert })
pointer = pointer + 1
end
pointer = pointer + 1
elseif (pointer > 1) and (diffs[pointer - 1].actionType == StringDiff.ActionTypes.Equal) then
-- Merge this equality with the previous one
diffs[pointer - 1].value = diffs[pointer - 1].value .. diffs[pointer].value
table.remove(diffs, pointer)
else
pointer = pointer + 1
end
countInsert, countDelete = 0, 0
textDelete, textInsert = "", ""
end
end
if diffs[#diffs].value == "" then
-- Remove the dummy entry at the end
diffs[#diffs] = nil
end
-- Second pass: look for single edits surrounded on both sides by equalities
-- which can be shifted sideways to eliminate an equality
-- e.g: ABAC -> ABAC
local changes = false
pointer = 2
-- Intentionally ignore the first and last element (don't need checking)
while pointer < #diffs do
local prevDiff, nextDiff = diffs[pointer - 1], diffs[pointer + 1]
if
(prevDiff.actionType == StringDiff.ActionTypes.Equal)
and (nextDiff.actionType == StringDiff.ActionTypes.Equal)
then
-- This is a single edit surrounded by equalities
local currentDiff = diffs[pointer]
local currentText = currentDiff.value
local prevText = prevDiff.value
local nextText = nextDiff.value
if #prevText == 0 then
table.remove(diffs, pointer - 1)
changes = true
elseif string.sub(currentText, -#prevText) == prevText then
-- Shift the edit over the previous equality
currentDiff.value = prevText .. string.sub(currentText, 1, -#prevText - 1)
nextDiff.value = prevText .. nextDiff.value
table.remove(diffs, pointer - 1)
changes = true
elseif string.sub(currentText, 1, #nextText) == nextText then
-- Shift the edit over the next equality
prevDiff.value = prevText .. nextText
currentDiff.value = string.sub(currentText, #nextText + 1) .. nextText
table.remove(diffs, pointer + 1)
changes = true
end
end
pointer = pointer + 1
end
-- If shifts were made, the diffs need reordering and another shift sweep
if changes then
return StringDiff._reorderAndMerge(diffs)
end
return diffs
end
return StringDiff