---------
-- This module does lexer-based optimizations.
--
-- **Notes:**
--
-- * TODO: General string delimiter conversion optimizer.
-- * TODO: (numbers) warn if overly significant digit.
----

local char = string.char
local find = string.find
local match = string.match
local rep = string.rep
local sub = string.sub
local tonumber = tonumber
local tostring = tostring

local print  -- set in optimize()

local M = {}

-- error function, can override by setting own function into module
M.error = error

M.warn = {}  -- table for warning flags

local stoks, sinfos, stoklns  -- source lists

local is_realtoken = {  -- significant (grammar) tokens
  TK_KEYWORD = true,
  TK_NAME = true,
  TK_NUMBER = true,
  TK_STRING = true,
  TK_LSTRING = true,
  TK_OP = true,
  TK_EOS = true,
}
local is_faketoken = {  -- whitespace (non-grammar) tokens
  TK_COMMENT = true,
  TK_LCOMMENT = true,
  TK_EOL = true,
  TK_SPACE = true,
}

local opt_details  -- for extra information

--- Returns true if current token is at the start of a line.
--
-- It skips over deleted tokens via recursion.
--
-- @tparam int i
-- @treturn bool
local function atlinestart(i)
  local tok = stoks[i - 1]
  if i <= 1 or tok == "TK_EOL" then
    return true
  elseif tok == "" then
    return atlinestart(i - 1)
  end
  return false
end

--- Returns true if current token is at the end of a line.
--
-- It skips over deleted tokens via recursion.
--
-- @tparam int i
-- @treturn bool
local function atlineend(i)
  local tok = stoks[i + 1]
  if i >= #stoks or tok == "TK_EOL" or tok == "TK_EOS" then
    return true
  elseif tok == "" then
    return atlineend(i + 1)
  end
  return false
end

--- Counts comment EOLs inside a long comment.
--
-- In order to keep line numbering, EOLs need to be reinserted.
--
-- @tparam string lcomment
-- @treturn int
local function commenteols(lcomment)
  local sep = #match(lcomment, "^%-%-%[=*%[")
  local z = sub(lcomment, sep + 1, -(sep - 1))  -- remove delims
  local i, c = 1, 0
  while true do
    local p, _, r, s = find(z, "([\r\n])([\r\n]?)", i)
    if not p then break end  -- if no matches, done
    i = p + 1
    c = c + 1
    if #s > 0 and r ~= s then  -- skip CRLF or LFCR
      i = i + 1
    end
  end
  return c
end

--- Compares two tokens (i, j) and returns the whitespace required.
--
-- See documentation for a reference table of interactions.
--
-- Only two grammar/real tokens are being considered:
--
-- * if `""`, no separation is needed,
-- * if `" "`, then at least one whitespace (or EOL) is required.
--
-- Note: This doesn't work at the start or the end or for EOS!
--
-- @tparam int i
-- @tparam int j
-- @treturn string
local function checkpair(i, j)
  local t1, t2 = stoks[i], stoks[j]
  if t1 == "TK_STRING" or t1 == "TK_LSTRING" or
     t2 == "TK_STRING" or t2 == "TK_LSTRING" then
    return ""
  elseif t1 == "TK_OP" or t2 == "TK_OP" then
    if (t1 == "TK_OP" and (t2 == "TK_KEYWORD" or t2 == "TK_NAME")) or
       (t2 == "TK_OP" and (t1 == "TK_KEYWORD" or t1 == "TK_NAME")) then
      return ""
    end
    if t1 == "TK_OP" and t2 == "TK_OP" then
      -- for TK_OP/TK_OP pairs, see notes in technotes.txt
      local op, op2 = sinfos[i], sinfos[j]
      if (match(op, "^%.%.?$") and match(op2, "^%.")) or
         (match(op, "^[~=<>]$") and op2 == "=") or
         (op == "[" and (op2 == "[" or op2 == "=")) then
        return " "
      end
      return ""
    end
    -- "TK_OP" + "TK_NUMBER" case
    local op = sinfos[i]
    if t2 == "TK_OP" then op = sinfos[j] end
    if match(op, "^%.%.?%.?$") then
      return " "
    end
    return ""
  else  -- "TK_KEYWORD" | "TK_NAME" | "TK_NUMBER"
    return " "
  end
end

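-- Hand-checked examples of the pair rules above (purely illustrative;
-- checkpair is local and reads the shared stoks/sinfos lists, so it cannot
-- be called standalone):
--
--   ".." before "."      -> " "  (abutting would lex as "...")
--   "<"  before "="      -> " "  (abutting would lex as "<=")
--   "["  before "["      -> " "  (abutting would open a long bracket)
--   ")"  before "("      -> ""   (no ambiguity, operators can abut)
--   name before keyword  -> " "  ("x" .. "end" must not become "xend")
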
--- Repack tokens, removing deletions caused by optimization process.
local function repack_tokens()
  local dtoks, dinfos, dtoklns = {}, {}, {}
  local j = 1
  for i = 1, #stoks do
    local tok = stoks[i]
    if tok ~= "" then
      dtoks[j], dinfos[j], dtoklns[j] = tok, sinfos[i], stoklns[i]
      j = j + 1
    end
  end
  stoks, sinfos, stoklns = dtoks, dinfos, dtoklns
end

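-- A minimal sketch of the deletion convention (hypothetical lists): a pass
-- deletes a token by blanking its type, and repack_tokens() then compacts
-- all three lists in step:
--
--   stoks = { "TK_NAME", "", "TK_EOL" }  -- "" marks a deleted token
--   repack_tokens()
--   -- stoks is now { "TK_NAME", "TK_EOL" }; sinfos and stoklns
--   -- shrink identically, so indices stay aligned
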
--- Does number optimization.
--
-- Optimization using string formatting functions is one way of doing this,
-- but here, we consider all cases and handle them separately (possibly an
-- idiotic approach...).
--
-- Scientific notation being generated is not in canonical form, this may or
-- may not be a bad thing.
--
-- Note: Intermediate portions need to fit into a normal number range.
--
-- Optimizations can be divided based on number patterns:
--
-- * hexadecimal:
--   (1) no need to remove leading zeros, just skip to (2)
--   (2) convert to integer if size equal or smaller
--       * change if equal size -> lose the 'x' to reduce entropy
--   (3) number is then processed as an integer
--   (4) note: does not make 0[xX] consistent
-- * integer:
--   (1) reduce useless fractional part, if present, e.g. 123.000 -> 123.
--   (2) remove leading zeros, e.g. 000123
-- * float:
--   (1) split into digits dot digits
--   (2) if no integer portion, take as zero (can omit later)
--   (3) handle degenerate .000 case, after which the fractional part
--       must be non-zero (if zero, it's matched as float .0)
--   (4) remove trailing zeros for fractional portion
--   (5) p.q where p > 0 and q > 0 cannot be shortened any more
--   (6) otherwise p == 0 and the form is .q, e.g. .000123
--   (7) if scientific shorter, convert, e.g. .000123 -> 123e-6
-- * scientific:
--   (1) split into (digits dot digits) [eE] ([+-] digits)
--   (2) if significand is zero, just use .0
--   (3) remove leading zeros for significand
--   (4) shift out trailing zeros for significand
--   (5) examine exponent and determine which format is best:
--       number with fraction, or scientific
--
-- Note: Number with fraction and scientific number is never converted
-- to integer, because Lua 5.3 distinguishes between integers and floats.
--
-- @tparam int i
local function do_number(i)
  local before = sinfos[i]  -- 'before'
  local z = before          -- working representation
  local y                   -- 'after', if better
  --------------------------------------------------------------------
  if match(z, "^0[xX]") then  -- hexadecimal number
    local v = tostring(tonumber(z))
    if #v <= #z then
      z = v  -- change to integer, AND continue
    else
      return  -- no change; stick to hex
    end
  end
  if match(z, "^%d+$") then  -- integer
    if tonumber(z) > 0 then
      y = match(z, "^0*([1-9]%d*)$")  -- remove leading zeros
    else
      y = "0"  -- basic zero
    end
  elseif not match(z, "[eE]") then  -- float
    local p, q = match(z, "^(%d*)%.(%d*)$")  -- split
    if p == "" then p = 0 end    -- int part zero
    if q == "" then q = "0" end  -- fraction part zero
    if tonumber(q) == 0 and tonumber(p) == 0 then
      y = ".0"  -- degenerate .000 to .0
    else
      -- now, q > 0 holds and p is a number
      local zeros_cnt = #match(q, "0*$")  -- remove trailing zeros
      if zeros_cnt > 0 then
        q = sub(q, 1, #q - zeros_cnt)
      end
      -- if p > 0, nothing else we can do to simplify p.q case
      if tonumber(p) > 0 then
        y = p.."."..q
      else
        y = "."..q  -- tentative, e.g. .000123
        local v = #match(q, "^0*")  -- # leading zeros
        local w = #q - v            -- # significant digits
        local nv = tostring(#q)
        -- e.g. compare 123e-6 versus .000123
        if w + 2 + #nv < 1 + #q then
          y = sub(q, -w).."e-"..nv
        end
      end
    end
  else  -- scientific number
    local sig, ex = match(z, "^([^eE]+)[eE]([%+%-]?%d+)$")
    ex = tonumber(ex)
    -- if got ".", shift out fractional portion of significand
    local p, q = match(sig, "^(%d*)%.(%d*)$")
    if p then
      ex = ex - #q
      sig = p..q
    end
    if tonumber(sig) == 0 then
      y = ".0"  -- basic float zero
    else
      local v = #match(sig, "^0*")  -- remove leading zeros
      sig = sub(sig, v + 1)
      v = #match(sig, "0*$")  -- shift out trailing zeros
      if v > 0 then
        sig = sub(sig, 1, #sig - v)
        ex = ex + v
      end
      -- examine exponent and determine which format is best
      local nex = tostring(ex)
      if ex >= 0 and (ex <= 1 + #nex) then  -- a float
        y = sig..rep("0", ex).."."
      elseif ex < 0 and (ex >= -#sig) then  -- fraction, e.g. .123
        v = #sig + ex
        y = sub(sig, 1, v).."."..sub(sig, v + 1)
      elseif ex < 0 and (#nex >= -ex - #sig) then
        -- e.g. compare 1234e-5 versus .01234
        -- gives: #sig + 1 + #nex >= 1 + (-ex - #sig) + #sig
        --     -> #nex >= -ex - #sig
        v = -ex - #sig
        y = "."..rep("0", v)..sig
      else  -- non-canonical scientific representation
        y = sig.."e"..ex
      end
    end--if sig
  end
  if y and y ~= sinfos[i] then
    if opt_details then
      print("<number> (line "..stoklns[i]..") "..sinfos[i].." -> "..y)
      opt_details = opt_details + 1
    end
    sinfos[i] = y
  end
end

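-- A few worked rewrites, traced by hand through the branches above (each
-- line is input -> output as do_number would produce it):
--
--   0x10     -> 16        -- hex converted; "16" is no longer than "0x10"
--   000123   -> 123       -- integer, leading zeros removed
--   123.000  -> 123.      -- float, useless fraction reduced
--   .000123  -> 123e-6    -- scientific is shorter than the fraction
--   1234e-5  -> .01234    -- fraction is shorter than scientific
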
--- Does string optimization.
--
-- Note: It works on well-formed strings only!
--
-- Optimizations on characters can be summarized as follows:
--
--   \a\b\f\n\r\t\v -- no change
--   \\             -- no change
--   \"\'           -- depends on delim, other can remove \
--   \[\]           -- remove \
--   \<char>        -- general escape, remove \ (Lua 5.1 only)
--   \<eol>         -- normalize the EOL only
--   \ddd           -- if \a\b\f\n\r\t\v, change to latter
--                     if other < ascii 32, keep ddd but zap leading zeros
--                     but cannot have following digits
--                     if >= ascii 32, translate it into the literal, then
--                     also do escapes for \\,\",\' cases
--   <other>        -- no change
--
-- Switch delimiters if string becomes shorter.
--
-- @tparam int I
local function do_string(I)
  local info = sinfos[I]
  local delim = sub(info, 1, 1)                 -- delimiter used
  local ndelim = (delim == "'") and '"' or "'"  -- opposite " <-> '
  local z = sub(info, 2, -2)                    -- actual string
  local i = 1
  local c_delim, c_ndelim = 0, 0                -- " or ' counts

  while i <= #z do
    local c = sub(z, i, i)
    if c == "\\" then  -- escaped stuff
      local j = i + 1
      local d = sub(z, j, j)
      local p = find("abfnrtv\\\n\r\"\'0123456789", d, 1, true)
      if not p then  -- \<char> -- remove \ (Lua 5.1 only)
        z = sub(z, 1, i - 1)..sub(z, j)
        i = i + 1
      elseif p <= 8 then  -- \a\b\f\n\r\t\v\\
        i = i + 2  -- no change
      elseif p <= 10 then  -- \<eol> -- normalize EOL
        local eol = sub(z, j, j + 1)
        if eol == "\r\n" or eol == "\n\r" then
          z = sub(z, 1, i).."\n"..sub(z, j + 2)
        elseif p == 10 then  -- \r case
          z = sub(z, 1, i).."\n"..sub(z, j + 1)
        end
        i = i + 2
      elseif p <= 12 then  -- \"\' -- remove \ for ndelim
        if d == delim then
          c_delim = c_delim + 1
          i = i + 2
        else
          c_ndelim = c_ndelim + 1
          z = sub(z, 1, i - 1)..sub(z, j)
          i = i + 1
        end
      else  -- \ddd -- various steps
        local s = match(z, "^(%d%d?%d?)", j)
        j = i + 1 + #s  -- skip to location
        local cv = tonumber(s)
        local cc = char(cv)
        p = find("\a\b\f\n\r\t\v", cc, 1, true)
        if p then  -- special escapes
          s = "\\"..sub("abfnrtv", p, p)
        elseif cv < 32 then  -- normalized \ddd
          if match(sub(z, j, j), "%d") then
            -- if a digit follows, \ddd cannot be shortened
            s = "\\"..s
          else
            s = "\\"..cv
          end
        elseif cc == delim then  -- \<delim>
          s = "\\"..cc
          c_delim = c_delim + 1
        elseif cc == "\\" then  -- \\
          s = "\\\\"
        else  -- literal character
          s = cc
          if cc == ndelim then
            c_ndelim = c_ndelim + 1
          end
        end
        z = sub(z, 1, i - 1)..s..sub(z, j)
        i = i + #s
      end--if p
    else  -- c ~= "\\"
      -- <other> -- no change
      i = i + 1
      if c == ndelim then  -- count ndelim, for switching delimiters
        c_ndelim = c_ndelim + 1
      end
    end--if c
  end--while

  -- Switching delimiters, a long-winded derivation:
  -- (1) delim takes 2+2*c_delim bytes, ndelim takes c_ndelim bytes
  -- (2) delim becomes c_delim bytes, ndelim becomes 2+2*c_ndelim bytes
  -- simplifying the condition (1)>(2) --> c_delim > c_ndelim
  if c_delim > c_ndelim then
    i = 1
    while i <= #z do
      local p, _, r = find(z, "([\'\"])", i)
      if not p then break end
      if r == delim then  -- \<delim> -> <delim>
        z = sub(z, 1, p - 2)..sub(z, p)
        i = p
      else  -- r == ndelim -- <ndelim> -> \<ndelim>
        z = sub(z, 1, p - 1).."\\"..sub(z, p)
        i = p + 2
      end
    end--while
    delim = ndelim  -- actually change delimiters
  end

  z = delim..z..delim
  if z ~= sinfos[I] then
    if opt_details then
      print("<string> (line "..stoklns[I]..") "..sinfos[I].." -> "..z)
      opt_details = opt_details + 1
    end
    sinfos[I] = z
  end
end

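-- Hand-traced examples of the escape and delimiter rules (input literal
-- -> optimized literal):
--
--   "\65\66\67"    -> "ABC"          -- \ddd at or above ASCII 32 become
--                                    -- literal characters
--   "\009"         -> "\t"           -- \ddd mapping to a one-letter escape
--   'it\'s Tom\'s' -> "it's Tom's"   -- c_delim (2) > c_ndelim (0) triggers
--                                    -- the delimiter switch
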
--- Does long string optimization.
--
-- * remove first optional newline
-- * normalize embedded newlines
-- * reduce '=' separators in delimiters if possible
--
-- Note: warning flagged if trailing whitespace found, not trimmed.
--
-- @tparam int I
local function do_lstring(I)
  local info = sinfos[I]
  local delim1 = match(info, "^%[=*%[")  -- cut out delimiters
  local sep = #delim1
  local delim2 = sub(info, -sep, -1)
  local z = sub(info, sep + 1, -(sep + 1))  -- lstring without delims
  local y = ""
  local i = 1

  while true do
    local p, _, r, s = find(z, "([\r\n])([\r\n]?)", i)
    -- deal with a single line
    local ln
    if not p then
      ln = sub(z, i)
    elseif p >= i then
      ln = sub(z, i, p - 1)
    end
    if ln ~= "" then
      -- flag a warning if there are trailing spaces, won't optimize!
      if match(ln, "%s+$") then
        M.warn.LSTRING = "trailing whitespace in long string near line "..stoklns[I]
      end
      y = y..ln
    end
    if not p then  -- done if no more EOLs
      break
    end
    -- deal with line endings, normalize them
    i = p + 1
    if p then
      if #s > 0 and r ~= s then  -- skip CRLF or LFCR
        i = i + 1
      end
      -- skip first newline, which can be safely deleted
      if not(i == 1 and i == p) then
        y = y.."\n"
      end
    end
  end--while

  -- handle possible deletion of one or more '=' separators
  if sep >= 3 then
    local chk, okay = sep - 1
    -- loop to test ending delimiter with less of '=' down to zero
    while chk >= 2 do
      local delim = "%]"..rep("=", chk - 2).."%]"
      if not match(y.."]", delim) then okay = chk end
      chk = chk - 1
    end
    if okay then  -- change delimiters
      sep = rep("=", okay - 2)
      delim1, delim2 = "["..sep.."[", "]"..sep.."]"
    end
  end

  sinfos[I] = delim1..y..delim2
end

--- Does long comment optimization.
--
-- * trim trailing whitespace
-- * normalize embedded newlines
-- * reduce '=' separators in delimiters if possible
--
-- Note: It does not remove first optional newline.
--
-- @tparam int I
local function do_lcomment(I)
  local info = sinfos[I]
  local delim1 = match(info, "^%-%-%[=*%[")  -- cut out delimiters
  local sep = #delim1
  local delim2 = sub(info, -(sep - 2), -1)
  local z = sub(info, sep + 1, -(sep - 1))  -- comment without delims
  local y = ""
  local i = 1

  while true do
    local p, _, r, s = find(z, "([\r\n])([\r\n]?)", i)
    -- deal with a single line, extract and check trailing whitespace
    local ln
    if not p then
      ln = sub(z, i)
    elseif p >= i then
      ln = sub(z, i, p - 1)
    end
    if ln ~= "" then
      -- trim trailing whitespace if non-empty line
      local ws = match(ln, "%s*$")
      if #ws > 0 then ln = sub(ln, 1, -(#ws + 1)) end
      y = y..ln
    end
    if not p then  -- done if no more EOLs
      break
    end
    -- deal with line endings, normalize them
    i = p + 1
    if p then
      if #s > 0 and r ~= s then  -- skip CRLF or LFCR
        i = i + 1
      end
      y = y.."\n"
    end
  end--while

  -- handle possible deletion of one or more '=' separators
  sep = sep - 2
  if sep >= 3 then
    local chk, okay = sep - 1
    -- loop to test ending delimiter with less of '=' down to zero
    while chk >= 2 do
      local delim = "%]"..rep("=", chk - 2).."%]"
      if not match(y, delim) then okay = chk end
      chk = chk - 1
    end
    if okay then  -- change delimiters
      sep = rep("=", okay - 2)
      delim1, delim2 = "--["..sep.."[", "]"..sep.."]"
    end
  end

  sinfos[I] = delim1..y..delim2
end

--- Does short comment optimization.
--
-- * trim trailing whitespace
--
-- @tparam int i
local function do_comment(i)
  local info = sinfos[i]
  local ws = match(info, "%s*$")  -- just look from end of string
  if #ws > 0 then
    info = sub(info, 1, -(#ws + 1))  -- trim trailing whitespace
  end
  sinfos[i] = info
end

--- Returns true if string found in long comment.
--
-- This is a feature to keep copyright or license texts.
--
-- @tparam bool opt_keep
-- @tparam string info
-- @treturn bool
local function keep_lcomment(opt_keep, info)
  if not opt_keep then return false end  -- option not set
  local delim1 = match(info, "^%-%-%[=*%[")  -- cut out delimiters
  local sep = #delim1
  local z = sub(info, sep + 1, -(sep - 1))  -- comment without delims
  if find(z, opt_keep, 1, true) then  -- try to match
    return true
  end
end

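-- A hand-traced example of the '=' separator reduction shared by the two
-- long-token optimizers above: for [==[hello]==], sep starts at 4 (the
-- length of "[==["); the loop tests the would-be closers "]=]" and "]]"
-- against the body (plus a guard "]" in the long-string case), finds
-- neither, so okay settles at 2 and the token shrinks to [[hello]]. Only
-- closers absent from the body are candidates, e.g. [==[a]]b]==] can drop
-- no further than [=[a]]b]=].
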
--- The main entry point.
--
-- * currently, lexer processing has 2 passes
-- * processing is done on a line-oriented basis, which is easier to
--   grok due to the next point...
-- * since there are various options that can be enabled or disabled,
--   processing is a little messy or convoluted
--
-- @tparam {[string]=bool,...} option
-- @tparam {string,...} toklist
-- @tparam {string,...} semlist
-- @tparam {int,...} toklnlist
-- @treturn {string,...} toklist
-- @treturn {string,...} semlist
-- @treturn {int,...} toklnlist
function M.optimize(option, toklist, semlist, toklnlist)
  -- Set option flags.
  local opt_comments = option["opt-comments"]
  local opt_whitespace = option["opt-whitespace"]
  local opt_emptylines = option["opt-emptylines"]
  local opt_eols = option["opt-eols"]
  local opt_strings = option["opt-strings"]
  local opt_numbers = option["opt-numbers"]
  local opt_x = option["opt-experimental"]
  local opt_keep = option.KEEP
  opt_details = option.DETAILS and 0  -- upvalues for details display
  print = M.print or _G.print

  if opt_eols then  -- forced settings, otherwise won't work properly
    opt_comments = true
    opt_whitespace = true
    opt_emptylines = true
  elseif opt_x then
    opt_whitespace = true
  end

  -- Variable initialization.
  stoks, sinfos, stoklns  -- set source lists
    = toklist, semlist, toklnlist
  local i = 1       -- token position
  local tok, info   -- current token
  local prev        -- position of last grammar token
                    -- on same line (for TK_SPACE stuff)

  -- Changes a token, info pair.
  local function settoken(tok, info, I)  --luacheck: ignore 431
    I = I or i
    stoks[I] = tok or ""
    sinfos[I] = info or ""
  end

  -- Experimental optimization for ';' operator.
  if opt_x then
    while true do
      tok, info = stoks[i], sinfos[i]
      if tok == "TK_EOS" then  -- end of stream/pass
        break
      elseif tok == "TK_OP" and info == ";" then
        -- ';' operator found, since it is entirely optional, set it
        -- as a space to let whitespace optimization do the rest
        settoken("TK_SPACE", " ")
      end
      i = i + 1
    end
    repack_tokens()
  end

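  -- An illustrative trace (hypothetical input): with opt-experimental on,
  -- "local x = 1; x = 2" has its ';' rewritten to a TK_SPACE here; the
  -- whitespace handling in PASS 1 below then collapses it, so the emitted
  -- source reads "local x = 1 x = 2".
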
  -- Processing loop (PASS 1)
  i = 1
  while true do
    tok, info = stoks[i], sinfos[i]

    local atstart = atlinestart(i)  -- set line begin flag
    if atstart then prev = nil end

    if tok == "TK_EOS" then  -- end of stream/pass
      break

    elseif tok == "TK_KEYWORD" or  -- keywords, identifiers,
           tok == "TK_NAME" or     -- operators
           tok == "TK_OP" then
      -- TK_KEYWORD and TK_OP can't be optimized without a big
      -- optimization framework; it would be more of an optimizing
      -- compiler, not a source code compressor
      -- TK_NAME that are locals needs parser to analyze/optimize
      prev = i

    elseif tok == "TK_NUMBER" then  -- numbers
      if opt_numbers then
        do_number(i)  -- optimize
      end
      prev = i

    elseif tok == "TK_STRING" or  -- strings, long strings
           tok == "TK_LSTRING" then
      if opt_strings then
        if tok == "TK_STRING" then
          do_string(i)  -- optimize
        else
          do_lstring(i)  -- optimize
        end
      end
      prev = i

    elseif tok == "TK_COMMENT" then  -- short comments
      if opt_comments then
        if i == 1 and sub(info, 1, 1) == "#" then
          -- keep shbang comment, trim whitespace
          do_comment(i)
        else
          -- safe to delete, as a TK_EOL (or TK_EOS) always follows
          settoken()  -- remove entirely
        end
      elseif opt_whitespace then  -- trim whitespace only
        do_comment(i)
      end

    elseif tok == "TK_LCOMMENT" then  -- long comments
      if keep_lcomment(opt_keep, info) then
        -- if --keep, we keep a long comment if <msg> is found;
        -- this is a feature to keep copyright or license texts
        if opt_whitespace then  -- trim whitespace only
          do_lcomment(i)
        end
        prev = i
      elseif opt_comments then
        local eols = commenteols(info)
        -- prepare opt_emptylines case first, if a disposable token
        -- follows, current one is safe to dump, else keep a space;
        -- it is implied that the operation is safe for '-', because
        -- current is a TK_LCOMMENT, and must be separate from a '-'
        if is_faketoken[stoks[i + 1]] then
          settoken()  -- remove entirely
          tok = ""
        else
          settoken("TK_SPACE", " ")
        end
        -- if there are embedded EOLs to keep and opt_emptylines is
        -- disabled, then switch the token into one or more EOLs
        if not opt_emptylines and eols > 0 then
          settoken("TK_EOL", rep("\n", eols))
        end
        -- if optimizing whitespaces, force reinterpretation of the
        -- token to give a chance for the space to be optimized away
        if opt_whitespace and tok ~= "" then
          i = i - 1  -- to reinterpret
        end
      else  -- disabled case
        if opt_whitespace then  -- trim whitespace only
          do_lcomment(i)
        end
        prev = i
      end

    elseif tok == "TK_EOL" then  -- line endings
      if atstart and opt_emptylines then
        settoken()  -- remove entirely
      elseif info == "\r\n" or info == "\n\r" then
        -- normalize the rest of the EOLs for CRLF/LFCR only
        -- (note that TK_LCOMMENT can change into several EOLs)
        settoken("TK_EOL", "\n")
      end

    elseif tok == "TK_SPACE" then  -- whitespace
      if opt_whitespace then
        if atstart or atlineend(i) then
          -- delete leading and trailing whitespace
          settoken()  -- remove entirely
        else
          -- at this point, since leading whitespace have been removed,
          -- there should be either a real token or a TK_LCOMMENT
          -- prior to hitting this whitespace; the TK_LCOMMENT case
          -- only happens if opt_comments is disabled; so prev ~= nil
          local ptok = stoks[prev]
          if ptok == "TK_LCOMMENT" then
            -- previous TK_LCOMMENT can abut with anything
            settoken()  -- remove entirely
          else
            -- prev must be a grammar token; consecutive TK_SPACE
            -- tokens are impossible when optimizing whitespace
            local ntok = stoks[i + 1]
            if is_faketoken[ntok] then
              -- handle special case where a '-' cannot abut with
              -- either a short comment or a long comment
              if (ntok == "TK_COMMENT" or ntok == "TK_LCOMMENT") and
                 ptok == "TK_OP" and sinfos[prev] == "-" then
                -- keep token
              else
                settoken()  -- remove entirely
              end
            else  -- is_realtoken
              -- check a pair of grammar tokens, if can abut, then
              -- delete space token entirely, otherwise keep one space
              local s = checkpair(prev, i + 1)
              if s == "" then
                settoken()  -- remove entirely
              else
                settoken("TK_SPACE", " ")
              end
            end
          end
        end
      end

    else
      error("unidentified token encountered")
    end

    i = i + 1
  end--while
  repack_tokens()

  -- Processing loop (PASS 2)
  if opt_eols then
    i = 1
    -- Aggressive EOL removal only works with most non-grammar tokens
    -- optimized away because it is a rather simple scheme -- basically
    -- it just checks 'real' token pairs around EOLs.
    if stoks[1] == "TK_COMMENT" then
      -- first comment still existing must be shbang, skip whole line
      i = 3
    end
    while true do
      tok = stoks[i]

      if tok == "TK_EOS" then  -- end of stream/pass
        break
      elseif tok == "TK_EOL" then  -- consider each TK_EOL
        local t1, t2 = stoks[i - 1], stoks[i + 1]
        if is_realtoken[t1] and is_realtoken[t2] then  -- sanity check
          local s = checkpair(i - 1, i + 1)
          if s == "" or t2 == "TK_EOS" then
            settoken()  -- remove entirely
          end
        end
      end--if tok

      i = i + 1
    end--while
    repack_tokens()
  end

  if opt_details and opt_details > 0 then print() end  -- spacing
  return stoks, sinfos, stoklns
end

return M
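
-- A minimal usage sketch. It assumes a companion lexer module producing the
-- three parallel lists this module expects; the 'llex' name and its 'lex'
-- function are illustrative assumptions, not part of this file:
--
--   local llex = require("llex")
--   local optlex = require("optlex")
--   local toks, infos, lns = llex.lex(source)
--   toks, infos, lns = optlex.optimize(
--     { ["opt-comments"] = true, ["opt-whitespace"] = true,
--       ["opt-numbers"] = true, ["opt-strings"] = true },
--     toks, infos, lns)
--   -- concatenating 'infos' reassembles the optimized source text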