---------
-- Lua 5.1+ lexical analyzer written in Lua.
--
-- This file is part of LuaSrcDiet, based on Yueliang material.
--
-- **Notes:**
--
-- * This is a version of the native 5.1.x lexer from Yueliang 0.4.0,
--   with significant modifications to handle LuaSrcDiet's needs:
--   (1) llex.error is an optional error function handler,
--   (2) seminfo for strings includes their delimiters and no
--       translation operations are performed on them.
-- * Shbang handling has been added to support executable scripts.
-- * NO localized decimal point replacement magic.
-- * NO limit to number of lines.
-- * NO support for compatible long strings (LUA_COMPAT_LSTR).
-- * Added goto keyword and double-colon operator (Lua 5.2+).
----
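
-- A minimal usage sketch (illustrative only, not part of the module; it
-- assumes this file is loadable as "luasrcdiet.llex"):
--
--   local llex = require "luasrcdiet.llex"
--   local toks, infos, lines = llex.lex("local n = 1\n", "@example.lua")
--   for i = 1, #toks do
--     print(lines[i], toks[i], infos[i])  --> e.g. 1  TK_KEYWORD  local
--   end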

local find = string.find
local fmt = string.format
local match = string.match
local sub = string.sub
local tonumber = tonumber

local M = {}

local kw = {}
for v in ([[
and break do else elseif end false for function goto if in
local nil not or repeat return then true until while]]):gmatch("%S+") do
  kw[v] = true
end

local z,         -- source stream
      sourceid,  -- name of source
      I,         -- position of lexer
      buff,      -- buffer for strings
      ln,        -- line number
      tok,       -- lexed token list
      seminfo,   -- lexed semantic information list
      tokln      -- line numbers for messages


--- Adds information to token listing.
--
-- @tparam string token
-- @tparam string info
local function addtoken(token, info)
  local i = #tok + 1
  tok[i] = token
  seminfo[i] = info
  tokln[i] = ln
end

--- Handles line number incrementation and end-of-line characters.
--
-- @tparam int i Position of lexer in the source stream.
-- @tparam bool is_tok Whether to add a TK_EOL token to the token list.
-- @treturn int The new position, just past the end-of-line sequence.
local function inclinenumber(i, is_tok)
  local old = sub(z, i, i)
  i = i + 1  -- skip '\n' or '\r'
  local c = sub(z, i, i)
  if (c == "\n" or c == "\r") and (c ~= old) then
    i = i + 1  -- skip '\n\r' or '\r\n'
    old = old..c
  end
  if is_tok then addtoken("TK_EOL", old) end
  ln = ln + 1
  I = i
  return i
end
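
-- For illustration: a two-character "\r\n" (or "\n\r") sequence is consumed
-- as a single end-of-line, so lexing "a\r\nb" yields one TK_EOL token whose
-- seminfo is "\r\n", and ln advances by one, not two.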

--- Returns a chunk name or id, no truncation for long names.
--
-- @treturn string
local function chunkid()
  if sourceid and match(sourceid, "^[=@]") then
    return sub(sourceid, 2)  -- remove first char
  end
  return "[string]"
end
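
-- For illustration: this follows Lua's chunk-name convention, where a
-- leading "@" or "=" marks a named source:
--
--   sourceid = "@myfile.lua"  -->  "myfile.lua"
--   sourceid = "print(1)"     -->  "[string]"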

--- Formats error message and throws error.
--
-- A simplified version, does not report what token was responsible.
--
-- @tparam string s
-- @tparam int line The line number.
-- @raise
local function errorline(s, line)
  local e = M.error or error
  e(fmt("%s:%d: %s", chunkid(), line or ln, s))
end

--- Counts separators (`=`) in a long string delimiter.
--
-- @tparam int i Position of lexer in the source stream.
-- @treturn int
local function skip_sep(i)
  local s = sub(z, i, i)
  i = i + 1
  local count = #match(z, "=*", i)
  i = i + count
  I = i
  return (sub(z, i, i) == s) and count or (-count) - 1
end
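
-- For illustration: skip_sep(i) expects i at the first bracket ('[' or ']')
-- and returns the number of '=' separators when the bracket pair matches,
-- or a negative value when it does not:
--
--   "[==[" --> 2    (level-2 long bracket)
--   "[["   --> 0    (plain long bracket)
--   "[=x"  --> -2   (one '=' but no second bracket: -count - 1)
--   "[x"   --> -1   (no '='; treated as a lone "[" operator by the lexer)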

--- Reads a long string or long comment.
--
-- @tparam bool is_str
-- @tparam int sep Separator count, as returned by skip_sep.
-- @treturn string
-- @raise if unfinished long string or comment.
local function read_long_string(is_str, sep)
  local i = I + 1  -- skip 2nd '['
  local c = sub(z, i, i)
  if c == "\r" or c == "\n" then  -- string starts with a newline?
    i = inclinenumber(i)  -- skip it
  end
  while true do
    local p, _, r = find(z, "([\r\n%]])", i)  -- (long range match)
    if not p then
      errorline(is_str and "unfinished long string" or
                "unfinished long comment")
    end
    i = p
    if r == "]" then  -- delimiter test
      if skip_sep(i) == sep then
        buff = sub(z, buff, I)
        I = I + 1  -- skip 2nd ']'
        return buff
      end
      i = I
    else  -- newline
      -- NB: buff holds the token's start position here; concatenation turns
      -- it into a string, which string.sub later coerces back to a number.
      buff = buff.."\n"
      i = inclinenumber(i)
    end
  end--while
end
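
-- For illustration: the closing delimiter must match the opening level, so
-- "[==[ ... ]==]" is closed only by "]==]"; an embedded "]]" or "]=]" does
-- not terminate it. Per header note (2), the returned seminfo keeps the
-- delimiters: lexing "x = [==[hi]==]" yields TK_LSTRING "[==[hi]==]".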

--- Reads a string.
--
-- @tparam string del The delimiter.
-- @treturn string
-- @raise if unfinished string or too large escape sequence.
local function read_string(del)
  local i = I
  while true do
    local p, _, r = find(z, "([\n\r\\\"\'])", i)  -- (long range match)
    if p then
      if r == "\n" or r == "\r" then
        errorline("unfinished string")
      end
      i = p
      if r == "\\" then  -- handle escapes
        i = i + 1
        r = sub(z, i, i)
        if r == "" then break end  -- (EOZ error)
        p = find("abfnrtv\n\r", r, 1, true)

        if p then  -- special escapes
          if p > 7 then
            i = inclinenumber(i)
          else
            i = i + 1
          end

        elseif find(r, "%D") then  -- other non-digits
          i = i + 1

        else  -- \xxx sequence
          local _, q, s = find(z, "^(%d%d?%d?)", i)
          i = q + 1
          if s + 1 > 256 then  -- UCHAR_MAX
            errorline("escape sequence too large")
          end

        end--if p
      else
        i = i + 1
        if r == del then  -- ending delimiter
          I = i
          return sub(z, buff, i - 1)  -- return string
        end
      end--if r
    else
      break  -- (error)
    end--if p
  end--while
  errorline("unfinished string")
end
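
-- For illustration: escape sequences are validated but not translated, per
-- header note (2), so the source fragment 'a\110' is returned verbatim as
-- its own seminfo, quotes included.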

--- Initializes lexer for given source _z and source name _sourceid.
--
-- @tparam string _z The source code.
-- @tparam string _sourceid Name of the source.
local function init(_z, _sourceid)
  z = _z                -- source
  sourceid = _sourceid  -- name of source
  I = 1                 -- lexer's position in source
  ln = 1                -- line number
  tok = {}              -- lexed token list
  seminfo = {}          -- lexed semantic information list
  tokln = {}            -- line numbers for messages

  -- Initial processing (shbang handling).
  local p, _, q, r = find(z, "^(#[^\r\n]*)(\r?\n?)")
  if p then  -- skip first line
    I = I + #q
    addtoken("TK_COMMENT", q)
    if #r > 0 then inclinenumber(I, true) end
  end
end
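
-- For illustration: given the source "#!/usr/bin/env lua\nprint(1)\n", the
-- first line is tokenized as TK_COMMENT (followed by a TK_EOL), so
-- executable scripts pass through the lexer unchanged.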

--- Runs lexer on the given source code.
--
-- @tparam string source The Lua source to scan.
-- @tparam ?string source_name Name of the source (optional).
-- @treturn {string,...} A list of lexed tokens.
-- @treturn {string,...} A list of semantic information (lexed strings).
-- @treturn {int,...} A list of line numbers.
function M.lex(source, source_name)
  init(source, source_name)

  while true do--outer
    local i = I
    -- inner loop allows break to be used to nicely section tests
    while true do --luacheck: ignore 512

      local p, _, r = find(z, "^([_%a][_%w]*)", i)
      if p then
        I = i + #r
        if kw[r] then
          addtoken("TK_KEYWORD", r)  -- reserved word (keyword)
        else
          addtoken("TK_NAME", r)  -- identifier
        end
        break  -- (continue)
      end

      local p, _, r = find(z, "^(%.?)%d", i)
      if p then  -- numeral
        if r == "." then i = i + 1 end
        local _, q, r = find(z, "^%d*[%.%d]*([eE]?)", i)  --luacheck: ignore 421
        i = q + 1
        if #r == 1 then  -- optional exponent
          if match(z, "^[%+%-]", i) then  -- optional sign
            i = i + 1
          end
        end
        local _, q = find(z, "^[_%w]*", i)
        I = q + 1
        local v = sub(z, p, q)  -- string equivalent
        if not tonumber(v) then  -- handles hex test also
          errorline("malformed number")
        end
        addtoken("TK_NUMBER", v)
        break  -- (continue)
      end

      local p, q, r, t = find(z, "^((%s)[ \t\v\f]*)", i)
      if p then
        if t == "\n" or t == "\r" then  -- newline
          inclinenumber(i, true)
        else
          I = q + 1  -- whitespace
          addtoken("TK_SPACE", r)
        end
        break  -- (continue)
      end

      local _, q = find(z, "^::", i)
      if q then
        I = q + 1
        addtoken("TK_OP", "::")
        break  -- (continue)
      end

      local r = match(z, "^%p", i)
      if r then
        buff = i
        local p = find("-[\"\'.=<>~", r, 1, true)  --luacheck: ignore 421
        if p then

          -- two-level if block for punctuation/symbols
          if p <= 2 then
            if p == 1 then  -- minus
              local c = match(z, "^%-%-(%[?)", i)
              if c then
                i = i + 2
                local sep = -1
                if c == "[" then
                  sep = skip_sep(i)
                end
                if sep >= 0 then  -- long comment
                  addtoken("TK_LCOMMENT", read_long_string(false, sep))
                else  -- short comment
                  I = find(z, "[\n\r]", i) or (#z + 1)
                  addtoken("TK_COMMENT", sub(z, buff, I - 1))
                end
                break  -- (continue)
              end
              -- (fall through for "-")
            else  -- [ or long string
              local sep = skip_sep(i)
              if sep >= 0 then
                addtoken("TK_LSTRING", read_long_string(true, sep))
              elseif sep == -1 then
                addtoken("TK_OP", "[")
              else
                errorline("invalid long string delimiter")
              end
              break  -- (continue)
            end

          elseif p <= 5 then
            if p < 5 then  -- strings
              I = i + 1
              addtoken("TK_STRING", read_string(r))
              break  -- (continue)
            end
            r = match(z, "^%.%.?%.?", i)  -- .|..|... dots
            -- (fall through)

          else  -- relational
            r = match(z, "^%p=?", i)
            -- (fall through)
          end
        end
        I = i + #r
        addtoken("TK_OP", r)  -- for other symbols, fall through
        break  -- (continue)
      end

      local r = sub(z, i, i)
      if r ~= "" then
        I = i + 1
        addtoken("TK_OP", r)  -- other single-char tokens
        break
      end
      addtoken("TK_EOS", "")  -- end of stream,
      return tok, seminfo, tokln  -- exit here

    end--while inner
  end--while outer
end
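
-- For illustration: M.lex("local n = 42 -- hi\n", "@demo.lua") produces the
-- token stream
--
--   TK_KEYWORD "local", TK_SPACE " ", TK_NAME "n", TK_SPACE " ",
--   TK_OP "=", TK_SPACE " ", TK_NUMBER "42", TK_SPACE " ",
--   TK_COMMENT "-- hi", TK_EOL "\n", TK_EOS ""
--
-- with a parallel list of line numbers (all 1 here, except TK_EOS on line 2).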

return M