---------
-- Lua 5.1+ lexical analyzer written in Lua.
--
-- This file is part of LuaSrcDiet, based on Yueliang material.
--
-- **Notes:**
--
-- * This is a version of the native 5.1.x lexer from Yueliang 0.4.0,
--   with significant modifications to handle LuaSrcDiet's needs:
--   (1) llex.error is an optional error function handler,
--   (2) seminfo for strings include their delimiters and no
--       translation operations are performed on them.
-- * ADDED shbang handling to support executable scripts.
-- * NO localized decimal point replacement magic.
-- * NO limit to number of lines.
-- * NO support for compatible long strings (LUA_COMPAT_LSTR).
-- * Added goto keyword and double-colon operator (Lua 5.2+).
----
local find = string.find
local fmt = string.format
local match = string.match
local sub = string.sub
local tonumber = tonumber

local M = {}

local kw = {}
for v in ([[
and break do else elseif end false for function goto if in
local nil not or repeat return then true until while]]):gmatch("%S+") do
  kw[v] = true
end

local z,        -- source stream
      sourceid, -- name of source
      I,        -- position of lexer
      buff,     -- buffer for strings
      ln,       -- line number
      tok,      -- lexed token list
      seminfo,  -- lexed semantic information list
      tokln     -- line numbers for messages

--- Adds information to token listing.
--
-- @tparam string token
-- @tparam string info
local function addtoken(token, info)
  local i = #tok + 1
  tok[i] = token
  seminfo[i] = info
  tokln[i] = ln
end

--- Increments the line number and handles end-of-line characters.
--
-- @tparam int i Position of lexer in the source stream.
-- @tparam bool is_tok If true, a TK_EOL token is added for the newline.
-- @treturn int The position after the end-of-line sequence.
local function inclinenumber(i, is_tok)
  local old = sub(z, i, i)
  i = i + 1  -- skip '\n' or '\r'
  local c = sub(z, i, i)
  if (c == "\n" or c == "\r") and (c ~= old) then
    i = i + 1  -- skip '\n\r' or '\r\n'
    old = old..c
  end
  if is_tok then
    addtoken("TK_EOL", old)
  end
  ln = ln + 1
  I = i
  return i
end

--- Returns a chunk name or id, no truncation for long names.
--
-- @treturn string
local function chunkid()
  if sourceid and match(sourceid, "^[=@]") then
    return sub(sourceid, 2)  -- remove first char
  end
  return "[string]"
end

--- Formats an error message and throws an error.
--
-- A simplified version; does not report what token was responsible.
--
-- @tparam string s
-- @tparam int line The line number.
-- @raise
local function errorline(s, line)
  local e = M.error or error
  e(fmt("%s:%d: %s", chunkid(), line or ln, s))
end

--- Counts separators (`=`) in a long string delimiter.
--
-- @tparam int i Position of lexer in the source stream.
-- @treturn int
local function skip_sep(i)
  local s = sub(z, i, i)
  i = i + 1
  local count = #match(z, "=*", i)
  i = i + count
  I = i
  return (sub(z, i, i) == s) and count or (-count) - 1
end

--- Reads a long string or long comment.
--
-- @tparam bool is_str
-- @tparam int sep The number of `=` separators in the opening delimiter.
-- @treturn string
-- @raise if unfinished long string or comment.
local function read_long_string(is_str, sep)
  local i = I + 1  -- skip 2nd '['
  local c = sub(z, i, i)
  if c == "\r" or c == "\n" then  -- string starts with a newline?
    i = inclinenumber(i)          -- skip it
  end
  while true do
    local p, _, r = find(z, "([\r\n%]])", i)  -- (long range match)
    if not p then
      errorline(is_str and "unfinished long string"
                        or "unfinished long comment")
    end
    i = p
    if r == "]" then  -- delimiter test
      if skip_sep(i) == sep then
        buff = sub(z, buff, I)
        I = I + 1  -- skip 2nd ']'
        return buff
      end
      i = I
    else  -- newline
      buff = buff.."\n"
      i = inclinenumber(i)
    end
  end--while
end
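-- The level encoding returned by skip_sep is worth spelling out. A minimal
-- illustration (hypothetical inputs, kept as a comment so the module stays
-- loadable):
--
--   skip_sep at "[==[..." --> 2    (valid opener with two '=' separators)
--   skip_sep at "[..."    --> -1   (a lone '[', the TK_OP "[" case below)
--   skip_sep at "[=x..."  --> -2   (invalid delimiter: '=' run not closed)
--
-- A non-negative result is the level (count of '=') of a well-formed
-- delimiter; negative results encode the two failure modes.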
--- Reads a string.
--
-- @tparam string del The delimiter.
-- @treturn string
-- @raise if unfinished string or too large escape sequence.
local function read_string(del)
  local i = I
  while true do
    local p, _, r = find(z, "([\n\r\\\"\'])", i)  -- (long range match)
    if p then
      if r == "\n" or r == "\r" then
        errorline("unfinished string")
      end
      i = p
      if r == "\\" then  -- handle escapes
        i = i + 1
        r = sub(z, i, i)
        if r == "" then break end  -- (EOZ error)
        p = find("abfnrtv\n\r", r, 1, true)
        if p then  -- special escapes
          if p > 7 then
            i = inclinenumber(i)
          else
            i = i + 1
          end
        elseif find(r, "%D") then  -- other non-digits
          i = i + 1
        else  -- \xxx sequence
          local _, q, s = find(z, "^(%d%d?%d?)", i)
          i = q + 1
          if s + 1 > 256 then  -- UCHAR_MAX
            errorline("escape sequence too large")
          end
        end--if p
      else
        i = i + 1
        if r == del then  -- ending delimiter
          I = i
          return sub(z, buff, i - 1)  -- return string
        end
      end--if r
    else
      break  -- (error)
    end--if p
  end--while
  errorline("unfinished string")
end

--- Initializes the lexer for the given source _z and source name _sourceid.
--
-- @tparam string _z The source code.
-- @tparam string _sourceid Name of the source.
local function init(_z, _sourceid)
  z = _z                -- source
  sourceid = _sourceid  -- name of source
  I = 1                 -- lexer's position in source
  ln = 1                -- line number
  tok = {}              -- lexed token list
  seminfo = {}          -- lexed semantic information list
  tokln = {}            -- line numbers for messages
  -- Initial processing (shbang handling).
  local p, _, q, r = find(z, "^(#[^\r\n]*)(\r?\n?)")
  if p then  -- skip first line
    I = I + #q
    addtoken("TK_COMMENT", q)
    if #r > 0 then inclinenumber(I, true) end
  end
end
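-- To make the shbang handling above concrete, a sketch with a hypothetical
-- input (comment only, not executed):
--
--   init("#!/usr/bin/env lua\nprint(1)", "@demo.lua")
--
-- emits TK_COMMENT with seminfo "#!/usr/bin/env lua" followed by a TK_EOL
-- token for the "\n", so the shbang line is preserved in the token stream
-- rather than confusing the main loop below.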
--- Runs the lexer on the given source code.
--
-- @tparam string source The Lua source to scan.
-- @tparam ?string source_name Name of the source (optional).
-- @treturn {string,...} A list of lexed tokens.
-- @treturn {string,...} A list of semantic information (lexed strings).
-- @treturn {int,...} A list of line numbers.
function M.lex(source, source_name)
  init(source, source_name)
  while true do--outer
    local i = I
    -- inner loop allows break to be used to nicely section tests
    while true do --luacheck: ignore 512
      local p, _, r = find(z, "^([_%a][_%w]*)", i)
      if p then
        I = i + #r
        if kw[r] then
          addtoken("TK_KEYWORD", r)  -- reserved word (keyword)
        else
          addtoken("TK_NAME", r)     -- identifier
        end
        break -- (continue)
      end
      local p, _, r = find(z, "^(%.?)%d", i)
      if p then  -- numeral
        if r == "." then i = i + 1 end
        local _, q, r = find(z, "^%d*[%.%d]*([eE]?)", i)  --luacheck: ignore 421
        i = q + 1
        if #r == 1 then  -- optional exponent
          if match(z, "^[%+%-]", i) then  -- optional sign
            i = i + 1
          end
        end
        local _, q = find(z, "^[_%w]*", i)
        I = q + 1
        local v = sub(z, p, q)   -- string equivalent
        if not tonumber(v) then  -- handles hex test also
          errorline("malformed number")
        end
        addtoken("TK_NUMBER", v)
        break -- (continue)
      end
      local p, q, r, t = find(z, "^((%s)[ \t\v\f]*)", i)
      if p then
        if t == "\n" or t == "\r" then  -- newline
          inclinenumber(i, true)
        else
          I = q + 1  -- whitespace
          addtoken("TK_SPACE", r)
        end
        break -- (continue)
      end
      local _, q = find(z, "^::", i)
      if q then
        I = q + 1
        addtoken("TK_OP", "::")
        break -- (continue)
      end
      local r = match(z, "^%p", i)
      if r then
        buff = i
        local p = find("-[\"\'.=<>~", r, 1, true)  --luacheck: ignore 421
        if p then
          -- two-level if block for punctuation/symbols
          if p <= 2 then
            if p == 1 then  -- minus
              local c = match(z, "^%-%-(%[?)", i)
              if c then
                i = i + 2
                local sep = -1
                if c == "[" then
                  sep = skip_sep(i)
                end
                if sep >= 0 then  -- long comment
                  addtoken("TK_LCOMMENT", read_long_string(false, sep))
                else  -- short comment
                  I = find(z, "[\n\r]", i) or (#z + 1)
                  addtoken("TK_COMMENT", sub(z, buff, I - 1))
                end
                break -- (continue)
              end
              -- (fall through for "-")
            else  -- [ or long string
              local sep = skip_sep(i)
              if sep >= 0 then
                addtoken("TK_LSTRING", read_long_string(true, sep))
              elseif sep == -1 then
                addtoken("TK_OP", "[")
              else
                errorline("invalid long string delimiter")
              end
              break -- (continue)
            end
          elseif p <= 5 then
            if p < 5 then  -- strings
              I = i + 1
              addtoken("TK_STRING", read_string(r))
              break -- (continue)
            end
            r = match(z, "^%.%.?%.?", i)  -- .|..|... dots
            -- (fall through)
          else  -- relational
            r = match(z, "^%p=?", i)
            -- (fall through)
          end
        end
        I = i + #r
        addtoken("TK_OP", r)  -- for other symbols, fall through
        break -- (continue)
      end
      local r = sub(z, i, i)
      if r ~= "" then
        I = i + 1
        addtoken("TK_OP", r)  -- other single-char tokens
        break
      end
      addtoken("TK_EOS", "")      -- end of stream,
      return tok, seminfo, tokln  -- exit here
    end--while inner
  end--while outer
end

return M
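-- A minimal usage sketch (the module path "luasrcdiet.llex" is an
-- assumption; adjust the require to wherever this file lives):
--
--   local llex = require("luasrcdiet.llex")
--   local toks, infos, lines = llex.lex('local x = 1 -- hi', "@demo.lua")
--   for i = 1, #toks do
--     print(lines[i], toks[i], infos[i])
--   end
--
-- Each index i carries the token type (e.g. "TK_KEYWORD", "TK_NAME"), its
-- verbatim source text, and the line it started on; the stream always ends
-- with a "TK_EOS" token whose seminfo is the empty string.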