466 lines
12 KiB
Lua
466 lines
12 KiB
Lua
|
---------
-- Source and binary equivalency comparisons
--
-- **Notes:**
--
-- * Intended as an extra safety check for mission-critical code,
--   should give affirmative results if everything works.
-- * Heavy on load() and string.dump(), which may be slowish,
--   and may cause problems for cross-compiled applications.
-- * Optional detailed information dump is mainly for debugging,
--   reason being, if the two are not equivalent when they should be,
--   then some form of optimization has failed.
-- * source: IMPORTANT: TK_NAME not compared if opt-locals enabled.
-- * binary: IMPORTANT: Some shortcuts are taken with int and size_t
--   value reading -- if the functions break, then the binary chunk
--   is very large indeed.
-- * binary: There is a lack of diagnostic information when a compare
--   fails; you can use ChunkSpy and compare using visual diff.
----
|
||
|
local byte = string.byte
|
||
|
local dump = string.dump
|
||
|
local load = loadstring or load --luacheck: ignore 113
|
||
|
local sub = string.sub
|
||
|
|
||
|
local M = {}

-- Significant (grammar) token types. build_stream() keeps only lexer
-- entries whose type is flagged here.
local is_realtoken = {}
for _, name in ipairs {
  "TK_KEYWORD",
  "TK_NAME",
  "TK_NUMBER",
  "TK_STRING",
  "TK_LSTRING",
  "TK_OP",
  "TK_EOS",
} do
  is_realtoken[name] = true
end

-- Module state; assigned by M.init().
local option, llex, warn
|
||
|
|
||
|
|
||
|
--- The initialization function.
--
-- Stores the three collaborators in module-level locals so that the
-- comparison functions can reach them.
--
-- @tparam {[string]=bool,...} _option Option flags; this module reads
--   the `DETAILS` and `opt-locals` entries.
-- @tparam luasrcdiet.llex _llex Lexer module; its `lex` function is called.
-- @tparam table _warn Warning flags; `SRC_EQUIV` / `BIN_EQUIV` are set to
--   true on it when a comparison fails.
function M.init(_option, _llex, _warn)
  option, llex, warn = _option, _llex, _warn
end
|
||
|
|
||
|
--- Builds lists containing a 'normal' lexer stream.
--
-- Lexes the source with `llex.lex` and copies, in order, every entry
-- whose token type is flagged in `is_realtoken`.
--
-- @tparam string s The source code.
-- @treturn table Token types of the retained entries.
-- @treturn table Semantic info of the retained entries.
local function build_stream(s)
  -- raw lexer output (with whitespace elements)
  local raw_tok, raw_sem = llex.lex(s)
  -- filtered output (real elements only)
  local tok = {}
  local seminfo = {}

  for idx = 1, #raw_tok do
    local kind = raw_tok[idx]
    if is_realtoken[kind] then
      tok[#tok + 1] = kind
      seminfo[#seminfo + 1] = raw_sem[idx]
    end
  end

  return tok, seminfo
end
|
||
|
|
||
|
--- Tests source (lexer stream) equivalence.
--
-- Lexes both sources, drops non-grammar tokens and compares the two
-- streams element by element. Nothing is returned; on the first
-- mismatch `warn.SRC_EQUIV` is set to true and, when `option.DETAILS`
-- is set, a "SRCEQUIV: ..." line is printed.
--
-- @tparam string z Original source code.
-- @tparam string dat Compressed source code.
function M.source(z, dat)

  -- Token types whose semantic info is compared as a plain string.
  local plain_compare = { TK_KEYWORD = true, TK_NAME = true, TK_OP = true }

  -- Compiles "return <literal>" and returns the dumped chunk, so that
  -- differently spelled but equal literals compare equal. Yields nil
  -- when the literal does not compile.
  local function dumpsem(literal)
    local fn = load("return "..literal, "z")
    if not fn then return nil end
    return dump(fn)
  end

  -- Flags non-equivalence and optionally prints the reason.
  local function report(msg)
    if option.DETAILS then print("SRCEQUIV: "..msg) end
    warn.SRC_EQUIV = true
  end

  -- Lexer streams: 'a' is the original, 'b' is the compressed result.
  local tok_a, sem_a = build_stream(z)
  local tok_b, sem_b = build_stream(dat)

  -- Shbang lines are compared without their EOL. Two absent lines are
  -- both nil and therefore equal; one absent line compares unequal.
  local shbang_a = z:match("^(#[^\r\n]*)")
  local shbang_b = dat:match("^(#[^\r\n]*)")
  if shbang_a ~= shbang_b then
    report("shbang lines different")
  end

  -- Streams of different length cannot be equivalent.
  local count = #tok_a
  if count ~= #tok_b then
    report("count "..count.." "..#tok_b)
    return
  end

  -- Element-wise comparison; stops at the first mismatch.
  for i = 1, count do
    local kind, other_kind = tok_a[i], tok_b[i]
    local sem, other_sem = sem_a[i], sem_b[i]

    if kind ~= other_kind then  -- by type
      report("type ["..i.."] "..kind.." "..other_kind)
      return
    end

    local differs = false
    if plain_compare[kind] then
      -- identifiers of optimized locals can't be compared
      if not (kind == "TK_NAME" and option["opt-locals"]) then
        differs = sem ~= other_sem  -- by semantic info (simple)
      end
    elseif kind ~= "TK_EOS" then  -- TK_EOS has no seminfo to compare
      -- "TK_NUMBER" or "TK_STRING" or "TK_LSTRING":
      -- compare the 'binary' form, so dump a function
      local bin, other_bin = dumpsem(sem), dumpsem(other_sem)
      differs = not bin or not other_bin or bin ~= other_bin
    end

    if differs then
      report("seminfo ["..i.."] "..kind.." "..sem.." "..other_sem)
      return
    end
  end

  -- Successful comparison if the end is reached with no reports.
end
|
||
|
|
||
|
--- Tests binary chunk equivalence (only for PUC Lua 5.1).
--
-- Both sources are compiled with load() and dumped with string.dump().
-- The two binary chunks are then walked in parallel: code, constants
-- and the per-function header bytes must match, while source names,
-- line numbers and debug information are read but ignored.
--
-- Nothing is returned; on the first mismatch `warn.BIN_EQUIV` is set to
-- true and, when `option.DETAILS` is set, a "BINEQUIV: ..." line is
-- printed. Every failure path returns immediately after reporting, so
-- a broken or uncompilable input never reaches code that would raise.
--
-- @tparam string z Original source code.
-- @tparam string dat Compressed source code.
function M.binary(z, dat)
  -- constant type tags used in the constant list of a prototype
  local TNIL = 0 --luacheck: ignore 211
  local TBOOLEAN = 1
  local TNUMBER = 3
  local TSTRING = 4

  -- values read from the chunk header of the first chunk
  local endian      -- 1 = little endian
  local sz_int      -- sizes of data types, in bytes
  local sz_sizet
  local sz_inst
  local sz_number
  local getint      -- int reader matching the chunk's endianness
  local getsizet    -- size_t reader matching the chunk's endianness

  -- Marks and optionally reports non-equivalence.
  local function bork(msg)
    if option.DETAILS then print("BINEQUIV: "..msg) end
    warn.BIN_EQUIV = true
  end

  -- Checks that `sz` bytes exist at the cursor; returns true if so,
  -- nothing otherwise.
  local function ensure(c, sz)
    if c.i + sz - 1 > c.len then return end
    return true
  end

  -- Skips `sz` bytes (default 1) without any bounds check.
  local function skip(c, sz)
    if not sz then sz = 1 end
    c.i = c.i + sz
  end

  -- Returns the byte value at the cursor and advances by one;
  -- returns nothing when the cursor is past the end.
  local function getbyte(c)
    local i = c.i
    if i > c.len then return end
    local d = sub(c.dat, i, i)
    c.i = i + 1
    return byte(d)
  end

  -- Returns an int value (little-endian), or nothing if truncated.
  local function getint_l(c)
    local n, scale = 0, 1
    if not ensure(c, sz_int) then return end
    for _ = 1, sz_int do
      n = n + scale * getbyte(c)
      scale = scale * 256
    end
    return n
  end

  -- Returns an int value (big-endian), or nothing if truncated.
  local function getint_b(c)
    local n = 0
    if not ensure(c, sz_int) then return end
    for _ = 1, sz_int do
      n = n * 256 + getbyte(c)
    end
    return n
  end

  -- Returns a size_t value (little-endian), or nothing if truncated.
  local function getsizet_l(c)
    local n, scale = 0, 1
    if not ensure(c, sz_sizet) then return end
    for _ = 1, sz_sizet do
      n = n + scale * getbyte(c)
      scale = scale * 256
    end
    return n
  end

  -- Returns a size_t value (big-endian), or nothing if truncated.
  local function getsizet_b(c)
    local n = 0
    if not ensure(c, sz_sizet) then return end
    for _ = 1, sz_sizet do
      n = n * 256 + getbyte(c)
    end
    return n
  end

  -- Returns a block of `sz` bytes (as a string) and advances past it,
  -- or nothing if the block would run past the end.
  local function getblock(c, sz)
    local i = c.i
    local j = i + sz - 1
    if j > c.len then return end
    local d = sub(c.dat, i, j)
    c.i = i + sz
    return d
  end

  -- Returns a size_t-prefixed string; a zero length yields "" (which is
  -- truthy, so callers can still test the result for failure).
  local function getstring(c)
    local n = getsizet(c)
    if not n then return end
    if n == 0 then return "" end
    return getblock(c, n)
  end

  -- Reads one byte from each chunk; returns the value when both exist
  -- and are equal, nothing otherwise.
  local function goodbyte(c1, c2)
    local b1, b2 = getbyte(c1), getbyte(c2)
    if not b1 or not b2 or b1 ~= b2 then
      return
    end
    return b1
  end

  -- Inverse of goodbyte(): returns true when the bytes are missing or
  -- different, nothing when they match.
  local function badbyte(c1, c2)
    local b = goodbyte(c1, c2)
    if not b then return true end
  end

  -- Reads one int from each chunk; returns the value when both exist
  -- and are equal, nothing otherwise.
  local function goodint(c1, c2)
    local i1, i2 = getint(c1), getint(c2)
    if not i1 or not i2 or i1 ~= i2 then
      return
    end
    return i1
  end

  -- Recursively-called function to compare function prototypes.
  -- Returns true when the two prototypes are equivalent; otherwise
  -- reports via bork() and returns nothing.
  local function getfunc(c1, c2)
    -- source name (ignored)
    if not getstring(c1) or not getstring(c2) then
      bork("bad source name"); return
    end
    -- linedefined (ignored)
    if not getint(c1) or not getint(c2) then
      bork("bad linedefined"); return
    end
    -- lastlinedefined (ignored)
    if not getint(c1) or not getint(c2) then
      bork("bad lastlinedefined"); return
    end
    -- the four single-byte fields below must all be present
    if not (ensure(c1, 4) and ensure(c2, 4)) then
      bork("prototype header broken"); return
    end
    -- nups (compared)
    if badbyte(c1, c2) then
      bork("bad nups"); return
    end
    -- numparams (compared)
    if badbyte(c1, c2) then
      bork("bad numparams"); return
    end
    -- is_vararg (compared)
    if badbyte(c1, c2) then
      bork("bad is_vararg"); return
    end
    -- maxstacksize (compared)
    if badbyte(c1, c2) then
      bork("bad maxstacksize"); return
    end
    -- code (compared)
    local ncode = goodint(c1, c2)
    if not ncode then
      bork("bad ncode"); return
    end
    local code1 = getblock(c1, ncode * sz_inst)
    local code2 = getblock(c2, ncode * sz_inst)
    if not code1 or not code2 or code1 ~= code2 then
      bork("bad code block"); return
    end
    -- constants (compared)
    local nconst = goodint(c1, c2)
    if not nconst then
      bork("bad nconst"); return
    end
    for _ = 1, nconst do
      local ctype = goodbyte(c1, c2)
      if not ctype then
        bork("bad const type"); return
      end
      -- other type tags (e.g. TNIL) carry no payload to compare
      if ctype == TBOOLEAN then
        if badbyte(c1, c2) then
          bork("bad boolean value"); return
        end
      elseif ctype == TNUMBER then
        local num1 = getblock(c1, sz_number)
        local num2 = getblock(c2, sz_number)
        if not num1 or not num2 or num1 ~= num2 then
          bork("bad number value"); return
        end
      elseif ctype == TSTRING then
        local str1 = getstring(c1)
        local str2 = getstring(c2)
        if not str1 or not str2 or str1 ~= str2 then
          bork("bad string value"); return
        end
      end
    end
    -- prototypes (compared recursively)
    local nproto = goodint(c1, c2)
    if not nproto then
      bork("bad nproto"); return
    end
    for _ = 1, nproto do
      if not getfunc(c1, c2) then
        bork("bad function prototype"); return
      end
    end
    -- debug information (ignored)
    -- lineinfo (ignored)
    local sizelineinfo1 = getint(c1)
    if not sizelineinfo1 then
      bork("bad sizelineinfo1"); return
    end
    local sizelineinfo2 = getint(c2)
    if not sizelineinfo2 then
      bork("bad sizelineinfo2"); return
    end
    if not getblock(c1, sizelineinfo1 * sz_int) then
      bork("bad lineinfo1"); return
    end
    if not getblock(c2, sizelineinfo2 * sz_int) then
      bork("bad lineinfo2"); return
    end
    -- locvars (ignored)
    local sizelocvars1 = getint(c1)
    if not sizelocvars1 then
      bork("bad sizelocvars1"); return
    end
    local sizelocvars2 = getint(c2)
    if not sizelocvars2 then
      bork("bad sizelocvars2"); return
    end
    for _ = 1, sizelocvars1 do
      if not getstring(c1) or not getint(c1) or not getint(c1) then
        bork("bad locvars1"); return
      end
    end
    for _ = 1, sizelocvars2 do
      if not getstring(c2) or not getint(c2) or not getint(c2) then
        bork("bad locvars2"); return
      end
    end
    -- upvalues (ignored)
    local sizeupvalues1 = getint(c1)
    if not sizeupvalues1 then
      bork("bad sizeupvalues1"); return
    end
    local sizeupvalues2 = getint(c2)
    if not sizeupvalues2 then
      bork("bad sizeupvalues2"); return
    end
    for _ = 1, sizeupvalues1 do
      if not getstring(c1) then bork("bad upvalues1"); return end
    end
    for _ = 1, sizeupvalues2 do
      if not getstring(c2) then bork("bad upvalues2"); return end
    end
    return true
  end

  -- Removes shbang line so that load runs.
  local function zap_shbang(s)
    local shbang = s:match("^(#[^\r\n]*\r?\n?)")
    if shbang then  -- cut out shbang
      s = sub(s, #shbang + 1)
    end
    return s
  end

  -- Attempt to compile, then dump to get binary chunk string.
  local cz = load(zap_shbang(z), "z")
  if not cz then
    bork("failed to compile original sources for binary chunk comparison")
    return
  end

  local cdat = load(zap_shbang(dat), "z")
  if not cdat then
    bork("failed to compile compressed result for binary chunk comparison")
    return  -- string.dump() below would raise on a nil argument
  end

  -- if load() works, dump assuming string.dump() is error-free
  local c1 = { i = 1, dat = dump(cz) }
  c1.len = #c1.dat

  local c2 = { i = 1, dat = dump(cdat) }
  c2.len = #c2.dat

  -- Parse binary chunks to verify equivalence.
  -- * For headers, handle sizes to allow a degree of flexibility.
  -- * Assume a valid binary chunk is generated, since it was not
  --   generated via external means.
  if not (ensure(c1, 12) and ensure(c2, 12)) then
    bork("header broken")
    return  -- the size fields would be nil and break the readers
  end
  skip(c1, 6)             -- skip signature(4), version, format
  endian = getbyte(c1)    -- 1 = little endian
  sz_int = getbyte(c1)    -- get data type sizes
  sz_sizet = getbyte(c1)
  sz_inst = getbyte(c1)
  sz_number = getbyte(c1)
  skip(c1)                -- skip integral flag
  skip(c2, 12)            -- skip other header (assume similar)

  if endian == 1 then     -- set for endian sensitive data we need
    getint = getint_l
    getsizet = getsizet_l
  else
    getint = getint_b
    getsizet = getsizet_b
  end

  -- Compare the prototype at the root. A failure has already been
  -- reported, and the cursors are then mid-chunk, so the trailing
  -- consistency check would only add a misleading second report.
  if not getfunc(c1, c2) then return end

  -- Both chunks must have been consumed exactly.
  if c1.i ~= c1.len + 1 then
    bork("inconsistent binary chunk1"); return
  elseif c2.i ~= c2.len + 1 then
    bork("inconsistent binary chunk2"); return
  end

  -- Successful comparison if end is reached with no borks.
end
|
||
|
|
||
|
return M
|