--- HTML parsing module for LuaXML
-- @module luaxml-mod-html
-- @author Michal Hoftich
-- NOTE(review): the text that followed the author name (from the e-mail
-- address up to the comparison inside `uchar`) was lost from this copy of the
-- file. The declarations below are rebuilt from how the names are used in the
-- rest of the module; anything else that lived here (for example a module
-- table) has to be restored from version control.

-- local aliases of the utf8 library functions used throughout the module
-- (presumably utf8.codepoint and utf8.char -- confirm against the original)
local ucodepoint = utf8.codepoint
local utfchar = utf8.char

--- Convert a codepoint to its UTF-8 encoded string.
-- @param codepoint number|nil codepoint to convert; nil and negative values
--   (such as the -1 end-of-stream marker) are accepted
-- @return string the encoded character, or "" when there is nothing to encode
local function uchar(codepoint)
  if codepoint and codepoint > -1 then
    return utfchar(codepoint)
  end
  return ""
end
-- namespace URIs known to the parser, indexed by a short symbolic name
local xmlns = {
  ["HTML"]   = "http://www.w3.org/1999/xhtml",
  ["MathML"] = "http://www.w3.org/1998/Math/MathML",
  ["SVG"]    = "http://www.w3.org/2000/svg",
  ["XLink"]  = "http://www.w3.org/1999/xlink",
  ["XML"]    = "http://www.w3.org/XML/1998/namespace",
  ["XMLNS"]  = "http://www.w3.org/2000/xmlns/",
}
-- Named entities are looked up through a search tree, because their support
-- is quite messy. The table of entities comes from a separate module whose
-- name depends on whether the global `kpse` is set.
local named_entities = require(kpse and "luaxml-namedentities" or "luaxml.namedentities")
-- root of the entity search tree; each node keeps its successors in `children`
local entity_tree = {children = {}}

--- Descend one level in a search tree, creating missing nodes on the way.
-- @param tree table node to descend from; gets a `children` table if it has none
-- @param char string key of the requested child
-- @return table the child node stored under `char`
local function update_tree(tree, char)
  if not tree.children then
    tree.children = {}
  end
  local child = tree.children[char]
  if not child then
    child = {}
    tree.children[char] = child
  end
  return child
end
-- register every named entity: walk (and extend) the tree one character of
-- the entity name at a time, then mark the last node with the entity data
for name, replacement in pairs(named_entities) do
  local node = entity_tree
  for letter in name:gmatch(".") do
    node = update_tree(node, letter)
  end
  node.entity = name
  node.char = replacement
end
--- Follow a list of characters through the named entity search tree.
-- @param tbl table array of characters that form the (partial) entity name
-- @return table|nil node reached by the characters, or nil when the path does
--   not exist; a node terminates an entity only when its `char` field is set
local function search_entity_tree(tbl)
  local node = entity_tree
  for _, letter in ipairs(tbl) do
    local children = node.children
    if not children then
      return nil
    end
    node = children[letter]
    if not node then
      return nil
    end
  end
  return node
end
-- basic node types of the document tree

--- Root node of the document.
local Root = {
  _type = "root",
  xmlns = xmlns.HTML
}

--- Create a new root node.
-- @return table node with an empty `children` list
function Root:init()
  self.__index = self
  self.__tostring = function (x) return "_ROOT" end
  local o = setmetatable({}, self)
  o.children = {}
  return o
end

--- Append a node to the list of children.
-- @param node table node to be added
function Root:add_child(node)
  local children = self.children
  table.insert(children, node)
end
--- Document type declaration node.
local Doctype = {
  _type = "doctype"
}

--- Create a new doctype node.
-- @param name string doctype name
-- @param parent table parent node
-- @return table new doctype node
function Doctype:init(name, parent)
  local o = {}
  setmetatable(o, self)
  self.__index = self
  -- Serialize the declaration. Both branches used to return an empty string,
  -- which made the test for `data` pointless and dropped the doctype from
  -- any serialized output.
  self.__tostring = function (x)
    local doctype_name = x.name or ""
    if x.data then
      return "<!DOCTYPE " .. doctype_name .. " " .. x.data .. ">"
    else
      return "<!DOCTYPE " .. doctype_name .. ">"
    end
  end
  self.add_child = Root.add_child
  o.parent = parent
  o.name = name
  o.children = {}
  return o
end

--- Store additional doctype data; it is printed after the doctype name.
-- @param data string text that followed the name in the declaration
function Doctype:add_data(data)
  self.data = data
end
--- Text node.
local Text = {
  _type = "text"
}

--- Create a new text node.
-- @param text string content of the node
-- @param parent table parent node
-- @return table new text node; it is printed as the text in apostrophes
function Text:init(text, parent)
  self.__index = self
  self.__tostring = function (x) return "'" .. x.text .. "'" end
  self.add_child = Root.add_child
  local o = setmetatable({}, self)
  o.text = text
  o.parent = parent
  o.children = {}
  return o
end
--- Comment node.
local Comment = {
  _type = "comment"
}

--- Create a new comment node.
-- @param text string content of the comment
-- @param parent table parent node
-- @return table new comment node
function Comment:init(text, parent)
  local o = {}
  setmetatable(o, self)
  self.__index = self
  o.text = text
  -- print the comment with its delimiters; this used to return an empty
  -- string, so comments disappeared from any serialized output
  self.__tostring = function (x) return "<!--" .. (x.text or "") .. "-->" end
  self.add_child = Root.add_child
  o.parent = parent
  o.children = {}
  return o
end
--- Element node.
local Element = {
  _type = "element"
}

--- Create a new element node.
-- @param tag string|table tag name; a table of characters is joined to a string
-- @param parent table parent node
-- @return table new element with empty `children` and `attr` lists, placed in
--   the HTML namespace
function Element:init(tag, parent)
  local o = setmetatable({}, self)
  self.__index = self
  if type(tag) == "table" then
    -- tag name collected as a list of characters
    o.tag = table.concat(tag)
  else
    o.tag = tag
  end
  -- print the element as its start tag
  self.__tostring = function(x)
    local parts = {}
    for _, attribute in ipairs(x.attr) do
      -- values that contain a double quote are delimited by apostrophes
      local quote = '"'
      if attribute.value:match('"') then
        quote = "'"
      end
      parts[#parts+1] = attribute.name .. "=" .. quote .. attribute.value .. quote
    end
    local start_tag = "<" .. x.tag
    if #parts > 0 then
      start_tag = start_tag .. " " .. table.concat(parts, " ")
    end
    if x.self_closing then
      return start_tag .. " />"
    end
    return start_tag .. ">"
  end
  self.add_child = Root.add_child
  o.children = {}
  o.attr = {}
  o.parent = parent
  -- every element starts in the HTML namespace
  o.xmlns = xmlns.HTML
  return o
end
-- Tokenizer state machine.
-- Every state is a function stored in HtmlStates under the state name. It
-- takes the HtmlParser object as its only argument and reads the current
-- codepoint from it.
local HtmlStates = {}
-- Codepoints of the characters the tokenizer compares against; they are
-- computed once here instead of in every state function.
local less_than = ucodepoint("<")
local greater_than = ucodepoint(">")
-- (the spelling "amperesand" is used consistently in the state functions)
local amperesand = ucodepoint("&")
local exclam = ucodepoint("!")
local question = ucodepoint("?")
local solidus = ucodepoint("/")
local equals = ucodepoint("=")
local quoting = ucodepoint('"')
local apostrophe = ucodepoint("'")
local semicolon = ucodepoint(";")
local hyphen = ucodepoint("-")
-- same value as `hyphen`
local dash = ucodepoint("-")
local numbersign = ucodepoint("#")
local smallx = ucodepoint("x")
local bigx = ucodepoint("X")
local right_square = ucodepoint("]")
local EOF = -1 -- special character, meaning end of stream
-- codepoint of the NULL character, see fix_null
local null = 0
--- Test for an ASCII uppercase letter (A-Z).
-- @param codepoint number codepoint to test
-- @return boolean always an explicit true or false, like is_alpha and is_space
local function is_upper_alpha(codepoint)
  return 64 < codepoint and codepoint < 91
end
--- Test for an ASCII lowercase letter (a-z).
-- @param codepoint number codepoint to test
-- @return boolean always an explicit true or false, like is_alpha and is_space
local function is_lower_alpha(codepoint)
  return 96 < codepoint and codepoint < 123
end
--- Test for an ASCII letter, regardless of case.
-- @param codepoint number codepoint to test
-- @return boolean true for A-Z and a-z, false otherwise
local function is_alpha(codepoint)
  local letter = is_upper_alpha(codepoint) or is_lower_alpha(codepoint)
  return letter and true or false
end
--- Test for an ASCII digit (0-9).
-- @param codepoint number codepoint to test
-- @return boolean always an explicit true or false, like is_alpha and is_space
local function is_numeric(codepoint)
  return 47 < codepoint and codepoint < 58
end
--- Test for an uppercase hexadecimal letter (A-F).
-- @param codepoint number codepoint to test
-- @return boolean always an explicit true or false, like is_alpha and is_space
local function is_upper_hex(codepoint)
  return 64 < codepoint and codepoint < 71
end
--- Test for a lowercase hexadecimal letter (a-f).
-- @param codepoint number codepoint to test
-- @return boolean always an explicit true or false, like is_alpha and is_space
local function is_lower_hex(codepoint)
  return 96 < codepoint and codepoint < 103
end
--- Test for a hexadecimal digit (0-9, a-f, A-F).
-- @param codepoint number codepoint to test
-- @return boolean always an explicit true or false, like is_alpha and is_space
local function is_hexadecimal(codepoint)
  if is_numeric(codepoint)
    or is_lower_hex(codepoint)
    or is_upper_hex(codepoint)
  then
    return true
  end
  return false
end
--- Test for an ASCII letter or digit.
-- @param codepoint number codepoint to test
-- @return true for letters, otherwise whatever is_numeric returns
local function is_alphanumeric(codepoint)
  if is_alpha(codepoint) then
    return true
  end
  return is_numeric(codepoint)
end
--- Test for a space character: tab, line feed, form feed or space.
-- @param codepoint number codepoint to test
-- @return boolean true for the four space characters, false otherwise
local function is_space(codepoint)
  return codepoint == 0x0020
    or codepoint == 0x0009
    or codepoint == 0x000A
    or codepoint == 0x000C
end
--- Test for a codepoint from the surrogate range U+D800 - U+DFFF.
-- @param codepoint number codepoint to test
-- @return boolean true when the codepoint is a surrogate
local function is_surrogate(codepoint)
  return codepoint >= 0xD800 and codepoint <= 0xDFFF
end
-- Replacements for numeric character references from the 0x80 - 0x9F range;
-- keys are the referenced codes, values are the codepoints used instead.
-- The table is local to the module, it used to leak into the global
-- environment because of a missing `local`.
local character_entity_replace_table = {
  [0x80] = 0x20AC,
  [0x82] = 0x201A,
  [0x83] = 0x0192,
  [0x84] = 0x201E,
  [0x85] = 0x2026,
  [0x86] = 0x2020,
  [0x87] = 0x2021,
  [0x88] = 0x02C6,
  [0x89] = 0x2030,
  [0x8A] = 0x0160,
  [0x8B] = 0x2039,
  [0x8C] = 0x0152,
  [0x8E] = 0x017D,
  [0x91] = 0x2018,
  [0x92] = 0x2019,
  [0x93] = 0x201C,
  [0x94] = 0x201D,
  [0x95] = 0x2022,
  [0x96] = 0x2013,
  [0x97] = 0x2014,
  [0x98] = 0x02DC,
  [0x99] = 0x2122,
  [0x9A] = 0x0161,
  [0x9B] = 0x203A,
  [0x9C] = 0x0153,
  [0x9E] = 0x017E,
  [0x9F] = 0x0178
}
--- Replace the NULL character by U+FFFD.
-- @param codepoint number codepoint to check
-- @return number 0xFFFD for NULL, the unchanged codepoint otherwise
local function fix_null(codepoint)
  return codepoint == null and 0xFFFD or codepoint
end
--- Data state, the default state of the tokenizer.
-- @param parser table HtmlParser object
-- @return string name of the next state
HtmlStates.data = function(parser)
  local current = parser.codepoint
  if current == less_than then
    -- a tag starts here
    return "tag_open"
  end
  if current == amperesand then
    -- remember where to continue once the character reference is processed
    parser.return_state = "data"
    return "character_reference"
  end
  if current == EOF then
    parser:emit_eof()
  else
    parser:emit_character(uchar(current))
  end
  return "data"
end
--- Tag open state, entered when "<" was found in the data state.
-- @param parser table HtmlParser object
-- @return string|nil name of the next state; nothing is returned at the end
--   of the stream
HtmlStates.tag_open = function(parser)
  local codepoint = parser.codepoint
  if codepoint == exclam then
    return "markup_declaration_open"
  elseif codepoint == solidus then
    return "end_tag_open"
  elseif codepoint == question then
    -- "<?" starts a bogus comment
    parser:start_token("comment",{data={}})
    return "bogus_comment"
  elseif is_alpha(codepoint) then
    -- start tag: prepare the token and reconsume the character as tag name
    local data = {
      name = {},
      attr = {},
      current_attr_name = {},
      current_attr_value = {},
      self_closing = false
    }
    parser:start_token("start_tag", data)
    return parser:tokenize("tag_name")
  elseif codepoint == EOF then
    -- the "<" that led here was not emitted yet, so it must be emitted now;
    -- ">" was emitted here by mistake
    parser:emit_character("<")
    parser:emit_eof()
  else
    -- invalid tag
    -- emit "<" and reconsume current character as data
    parser:emit_character("<")
    return parser:tokenize("data")
  end
end
--- Character reference state, entered after "&".
-- The temporary buffer is reset to the ampersand, so it can be flushed as
-- text when no entity follows.
-- @param parser table HtmlParser object
-- @return string name of the next state
HtmlStates.character_reference = function(parser)
  parser.temp_buffer = {"&"}
  local codepoint = parser.codepoint
  if codepoint == numbersign then
    -- numeric reference, keep "#" in the buffer
    table.insert(parser.temp_buffer, uchar(codepoint))
    return "numeric_character_reference"
  end
  if is_alphanumeric(codepoint) then
    -- reconsume the character as the start of an entity name
    return parser:tokenize("named_character_reference")
  end
  -- plain ampersand: print it and continue in the state we came from
  parser:flush_temp_buffer()
  return parser:tokenize(parser.return_state)
end
--- Named character reference state.
-- Characters of the entity name are collected in parser.temp_buffer (whose
-- first item is "&") as long as they form a path in the entity search tree.
-- @param parser table HtmlParser object
-- @return string name of the next state
HtmlStates.named_character_reference = function(parser)
-- named entity parsing is pretty complicated
-- https://html.spec.whatwg.org/multipage/parsing.html#named-character-reference-state
local codepoint = parser.codepoint
-- test if the current entity name is included in the named entity list
local search_table = {}
-- first char in temp buffer is &, which we don't want to lookup in the search tree
for i=2, #parser.temp_buffer do search_table[#search_table+1] = parser.temp_buffer[i] end
if codepoint == semicolon then
-- close named entity
local entity = search_entity_tree(search_table)
if entity and entity.char then
parser:add_entity(entity.char)
else
-- if the current name doesn't correspond to any named entity, flush everything into text
-- and reconsume the semicolon in the return state
parser:flush_temp_buffer()
return parser:tokenize(parser.return_state)
end
return parser.return_state
else
local char = uchar(codepoint)
-- try if the current entity name is in the named entity search tree
table.insert(search_table, char)
local entity = search_entity_tree(search_table)
if entity then
-- keep parsing name entity while we match a name
table.insert(parser.temp_buffer, char)
return "named_character_reference"
else
-- the current character doesn't continue any entity name
if #search_table > 1 then
local token = parser.current_token
-- a start tag token being processed is taken as a sign that the reference
-- is part of an attribute value
if token.type == "start_tag" and (codepoint == equals or is_alphanumeric(codepoint)) then
-- in attribute value, flush characters and retokenize
parser:flush_temp_buffer()
return parser:tokenize(parser.return_state)
else
-- try to get entity for characters preceding the current character
table.remove(search_table)
local newentity = search_entity_tree(search_table)
if newentity and newentity.char then
parser:add_entity(newentity.char)
else
-- we need to find if parts of the current substring match a named entity
-- for example &notit; -> ¬it; but &notin; -> ∉
local rest = {}
-- loop over the table with characters, and try to find if it matches entity
-- characters are removed from the end one by one and kept in `rest`
for i = #search_table, 1,-1 do
local removed_char = table.remove(search_table)
-- keep the removed characters in their original order
table.insert(rest, 1, removed_char)
newentity = search_entity_tree(search_table)
if newentity and newentity.char then
parser:add_entity(newentity.char)
-- replace temporary buffer with characters that followed the matched entity
parser.temp_buffer = rest
break
end
end
-- flush the buffer: the rest after a matched entity, or the whole original
-- buffer when no entity matched
parser:flush_temp_buffer()
end
-- the current character is reconsumed in the return state
return parser:tokenize(parser.return_state)
end
else
-- search table contains only the current character
parser:flush_temp_buffer()
return parser:tokenize(parser.return_state)
end
end
end
end
--- Numeric character reference state, entered after "&#".
-- Resets the number collected for the reference.
-- @param parser table HtmlParser object
-- @return string name of the next state
HtmlStates.numeric_character_reference = function(parser)
  local codepoint = parser.codepoint
  -- this field will hold the referenced number
  parser.character_reference_code = 0
  local hexadecimal = codepoint == smallx or codepoint == bigx
  if not hexadecimal then
    -- reconsume the character as a possible decimal digit
    return parser:tokenize("decimal_character_reference_start")
  end
  -- keep the "x" in the buffer, it is printed if no digits follow
  table.insert(parser.temp_buffer, uchar(codepoint))
  return "hexadecimal_character_reference_start"
end
--- Test that a hexadecimal digit follows "&#x".
-- @param parser table HtmlParser object
-- @return string name of the next state
HtmlStates.hexadecimal_character_reference_start = function(parser)
  if not is_hexadecimal(parser.codepoint) then
    -- not a reference, print the collected characters as text
    parser:flush_temp_buffer()
    return parser:tokenize(parser.return_state)
  end
  return parser:tokenize("hexadecimal_character_reference")
end
--- Test that a decimal digit follows "&#".
-- @param parser table HtmlParser object
-- @return string name of the next state
HtmlStates.decimal_character_reference_start = function(parser)
  if not is_numeric(parser.codepoint) then
    -- not a reference, print the collected characters as text
    parser:flush_temp_buffer()
    return parser:tokenize(parser.return_state)
  end
  return parser:tokenize("decimal_character_reference")
end
--- Collect digits of a decimal character reference.
-- @param parser table HtmlParser object
-- @return string name of the next state
HtmlStates.decimal_character_reference = function(parser)
  local codepoint = parser.codepoint
  if is_numeric(codepoint) then
    local code = parser.character_reference_code * 10 + (codepoint - 0x30)
    -- Saturate just above the largest codepoint. The value stays invalid for
    -- numeric_reference_end_state, but a long run of digits cannot overflow
    -- the integer and wrap around to a negative or accidentally valid number.
    if code > 0x10FFFF then
      code = 0x110000
    end
    parser.character_reference_code = code
    return "decimal_character_reference"
  elseif codepoint == semicolon then
    return "numeric_reference_end_state"
  end
  -- the reference is not terminated by a semicolon:
  -- this adds current entity
  parser:tokenize("numeric_reference_end_state")
  -- now tokenize the current character
  return parser:tokenize(parser.return_state)
end
--- Collect digits of a hexadecimal character reference.
-- @param parser table HtmlParser object
-- @return string name of the next state
HtmlStates.hexadecimal_character_reference = function(parser)
  local codepoint = parser.codepoint
  -- numeric value of the current hexadecimal digit
  local digit
  if is_numeric(codepoint) then
    digit = codepoint - 0x30
  elseif is_upper_hex(codepoint) then
    digit = codepoint - 0x37
  elseif is_lower_hex(codepoint) then
    digit = codepoint - 0x57
  end
  if digit then
    local code = parser.character_reference_code * 16 + digit
    -- Saturate just above the largest codepoint. The value stays invalid for
    -- numeric_reference_end_state, but a long run of digits cannot overflow
    -- the integer and wrap around to a negative or accidentally valid number.
    if code > 0x10FFFF then
      code = 0x110000
    end
    parser.character_reference_code = code
    return "hexadecimal_character_reference"
  elseif codepoint == semicolon then
    return "numeric_reference_end_state"
  end
  -- the reference is not terminated by a semicolon:
  -- this adds current entity
  parser:tokenize("numeric_reference_end_state")
  -- now tokenize the current character
  return parser:tokenize(parser.return_state)
end
--- Finish a numeric character reference.
-- The current codepoint is not inspected here, only the collected number.
-- @param parser table HtmlParser object
-- @return string the state that was active before the reference
HtmlStates.numeric_reference_end_state = function(parser)
  local code = parser.character_reference_code
  -- NULL, numbers outside of Unicode and surrogates are invalid
  local invalid = code == 0x00 or code > 0x10FFFF or is_surrogate(code)
  if invalid then
    code = 0xFFFD
  else
    -- should we add special support for "noncharacter"? I think we can pass
    -- them to the output anyway
    code = character_entity_replace_table[code] or code
  end
  parser:add_entity(uchar(code))
  return parser.return_state
end
-- NOTE(review): this definition looks truncated. The only comment inside it
-- is cut in the middle of a sentence, and the function is followed by an
-- unmatched `end`, so the file cannot be loaded in this form. The missing
-- text probably also held `doctype_eof`, which consume_doctype_data calls but
-- which is not defined in the visible code -- restore the original text from
-- version control.
HtmlStates.markup_declaration_open = function(parser)
-- what remains here only switches to the state that reads doctype data
-- started by is consumed as token.data
return "consume_doctype_data"
end
end
--- Read everything up to ">" as data of the doctype token.
-- @param parser table HtmlParser object
-- @return string|nil name of the next state; nothing is returned at the end
--   of the stream
HtmlStates.consume_doctype_data = function(parser)
  local codepoint = parser.codepoint
  if codepoint == EOF then
    doctype_eof(parser)
    return
  end
  if codepoint == greater_than then
    -- end of the declaration
    parser:emit()
    return "data"
  end
  parser:append_token_data("data", uchar(codepoint))
  return "consume_doctype_data"
end
--- Tag name state: collect the lowercased name of the current tag token.
-- @param parser table HtmlParser object
-- @return string name of the next state
HtmlStates.tag_name = function(parser)
  local codepoint = fix_null(parser.codepoint)
  if is_space(codepoint) then
    return "before_attribute_name"
  end
  if codepoint == solidus then
    return "self_closing_tag"
  end
  if codepoint == greater_than then
    parser:emit()
    return "data"
  end
  if codepoint == EOF then
    -- emit what we have and finish
    parser:emit()
    parser:emit_eof()
  else
    local char = uchar(codepoint)
    if is_upper_alpha(codepoint) then
      char = string.lower(char)
    end
    parser:append_token_data("name", char)
  end
  return "tag_name"
end
--- Self closing tag state, entered after "/" inside of a tag.
-- @param parser table HtmlParser object
-- @return string name of the next state
HtmlStates.self_closing_tag = function(parser)
  if parser.codepoint ~= greater_than then
    -- the solidus didn't close the tag, reconsume the character
    return parser:tokenize("before_attribute_name")
  end
  parser.current_token.self_closing = true
  parser:emit()
  return "data"
end
--- Before attribute name state.
-- @param parser table HtmlParser object
-- @return string|nil name of the next state; nothing is returned for "="
HtmlStates.before_attribute_name = function(parser)
  local codepoint = parser.codepoint
  if is_space(codepoint) then
    -- spaces before attributes are ignored
    return "before_attribute_name"
  end
  if codepoint == solidus or codepoint == greater_than then
    -- reconsume in after_attribute_name
    return parser:tokenize("after_attribute_name")
  end
  if codepoint == equals then
    -- ToDo: handle https://html.spec.whatwg.org/multipage/parsing.html#parse-error-unexpected-equals-sign-before-attribute-name
    return
  end
  -- anything else starts a new attribute
  parser:start_attribute()
  return parser:tokenize("attribute_name")
end
--- Attribute name state: collect the lowercased name of the current attribute.
-- @param parser table HtmlParser object
-- @return string name of the next state
HtmlStates.attribute_name = function(parser)
  local codepoint = parser.codepoint
  local name_ended = is_space(codepoint)
    or codepoint == solidus
    or codepoint == greater_than
  if name_ended then
    return parser:tokenize("after_attribute_name")
  end
  if codepoint == equals then
    return "before_attribute_value"
  end
  local char = uchar(codepoint)
  if is_upper_alpha(codepoint) then
    -- attribute names are lowercased
    char = string.lower(char)
  end
  parser:append_token_data("current_attr_name", char)
  return "attribute_name"
end
--- After attribute name state.
-- @param parser table HtmlParser object
-- @return string name of the next state
HtmlStates.after_attribute_name = function(parser)
  local codepoint = parser.codepoint
  if is_space(codepoint) then
    return "after_attribute_name"
  end
  if codepoint == equals then
    return "before_attribute_value"
  end
  if codepoint == solidus then
    return "self_closing_tag"
  end
  if codepoint == greater_than then
    parser:emit()
    return "data"
  end
  -- another attribute follows an attribute without value
  parser:start_attribute()
  return parser:tokenize("attribute_name")
end
--- Before attribute value state, entered after "=".
-- @param parser table HtmlParser object
-- @return string name of the next state
HtmlStates.before_attribute_value = function(parser)
  local codepoint = parser.codepoint
  if is_space(codepoint) then
    return "before_attribute_value"
  end
  if codepoint == quoting then
    return "attribute_value_quoting"
  end
  if codepoint == apostrophe then
    return "attribute_value_apostrophe"
  end
  if codepoint == greater_than then
    -- the value is missing
    parser:emit()
    return "data"
  end
  -- value without quotes, reconsume its first character
  return parser:tokenize("attribute_value_unquoted")
end
--- Attribute value delimited by double quotes.
-- @param parser table HtmlParser object
-- @return string name of the next state
HtmlStates.attribute_value_quoting = function(parser)
  local codepoint = parser.codepoint
  if codepoint == quoting then
    return "after_attribute_value_quoting"
  end
  if codepoint == amperesand then
    -- continue here after the character reference
    parser.return_state = "attribute_value_quoting"
    return "character_reference"
  end
  parser:append_token_data("current_attr_value", uchar(codepoint))
  return "attribute_value_quoting"
end
--- Attribute value delimited by apostrophes.
-- @param parser table HtmlParser object
-- @return string name of the next state
HtmlStates.attribute_value_apostrophe = function(parser)
  local codepoint = parser.codepoint
  if codepoint == apostrophe then
    return "after_attribute_value_quoting"
  end
  if codepoint == amperesand then
    -- continue here after the character reference
    parser.return_state = "attribute_value_apostrophe"
    return "character_reference"
  end
  parser:append_token_data("current_attr_value", uchar(codepoint))
  return "attribute_value_apostrophe"
end
--- Attribute value without delimiters; it ends at a space or ">".
-- @param parser table HtmlParser object
-- @return string name of the next state
HtmlStates.attribute_value_unquoted = function(parser)
  local codepoint = parser.codepoint
  if is_space(codepoint) then
    return "before_attribute_name"
  end
  if codepoint == amperesand then
    -- continue here after the character reference
    parser.return_state = "attribute_value_unquoted"
    return "character_reference"
  end
  if codepoint == greater_than then
    parser:emit()
    return "data"
  end
  parser:append_token_data("current_attr_value", uchar(codepoint))
  return "attribute_value_unquoted"
end
--- State after the closing delimiter of an attribute value.
-- @param parser table HtmlParser object
-- @return string name of the next state
HtmlStates.after_attribute_value_quoting = function(parser)
  local codepoint = parser.codepoint
  if is_space(codepoint) then
    return "before_attribute_name"
  end
  if codepoint == solidus then
    return "self_closing_tag"
  end
  if codepoint == greater_than then
    parser:emit()
    return "data"
  end
  -- missing space between attributes, reconsume the character
  return parser:tokenize("before_attribute_name")
end
--- RCDATA state: text in which character references are still processed.
-- @param parser table HtmlParser object
-- @return string name of the next state
HtmlStates.rcdata = function(parser)
  local codepoint = fix_null(parser.codepoint)
  if codepoint == less_than then
    -- possible end tag
    return "rcdata_less_than"
  end
  if codepoint == amperesand then
    -- continue here after the character reference
    parser.return_state = "rcdata"
    return "character_reference"
  end
  if codepoint == EOF then
    parser:emit_eof()
  else
    parser:emit_character(uchar(codepoint))
  end
  return "rcdata"
end
--- Drop the current token and print the given text instead.
-- @param parser table HtmlParser object
-- @param text string characters that should be emitted as text
local function discard_rcdata_end_tag(parser, text)
  parser.discard_token(parser)
  parser.emit_character(parser, text)
end
--- State after "<" in RCDATA.
-- @param parser table HtmlParser object
-- @return string name of the next state
HtmlStates.rcdata_less_than = function(parser)
  if parser.codepoint == solidus then
    return "rcdata_end_tag_open"
  end
  -- no end tag, "<" is just a text
  discard_rcdata_end_tag(parser, "<")
  return parser:tokenize("rcdata")
end
--- State after "</" in RCDATA.
-- @param parser table HtmlParser object
-- @return string name of the next state
HtmlStates.rcdata_end_tag_open = function(parser)
  local codepoint = parser.codepoint
  if is_alpha(codepoint) then
    parser:start_token("end_tag", {name={}})
    -- the buffer keeps the characters of the name as they were written
    parser.temp_buffer = {}
    return parser:tokenize("rcdata_end_tag_name")
  else
    -- not an end tag: the consumed "</" is a text and must be printed
    -- (an empty string was emitted here, which lost both characters)
    discard_rcdata_end_tag(parser, "</")
    return parser:tokenize("rcdata")
  end
end
--- Collect name of a possible end tag in RCDATA.
-- The end tag is accepted only when its name is the same as the name of the
-- currently opened element.
-- @param parser table HtmlParser object
-- @return string name of the next state
HtmlStates.rcdata_end_tag_name = function(parser)
  -- we need to find name of the currently opened tag
  local parent = parser:get_parent() or {}
  local opened_tag = parent.tag
  local current_tag = table.concat(parser.current_token.name or {})
  local codepoint = parser.codepoint
  if is_upper_alpha(codepoint) then
    parser:append_token_data("name", uchar(codepoint + 0x20))
    -- insert current char to temp buffer
    table.insert(parser.temp_buffer, uchar(codepoint))
    return "rcdata_end_tag_name"
  elseif is_lower_alpha(codepoint) then
    parser:append_token_data("name", uchar(codepoint))
    table.insert(parser.temp_buffer, uchar(codepoint))
    return "rcdata_end_tag_name"
  end
  if opened_tag == current_tag then
    if is_space(codepoint) then
      return "before_attribute_name"
    elseif codepoint == solidus then
      return "self_closing_tag"
    elseif codepoint == greater_than then
      parser:emit()
      return "data"
    end
    -- any other character falls through to the code below; it used to be
    -- silently dropped, with no state returned
  end
  -- not the end tag we are looking for: print "</" and the collected name as
  -- a text and reconsume the current character
  discard_rcdata_end_tag(parser, "</" .. table.concat(parser.temp_buffer))
  parser.temp_buffer = {}
  return parser:tokenize("rcdata")
end
--- RAWTEXT state: text without any markup except for the end tag.
-- @param parser table HtmlParser object
-- @return string|nil name of the next state; nothing is returned at the end
--   of the stream
HtmlStates.rawtext = function(parser)
  local codepoint = fix_null(parser.codepoint)
  if codepoint == less_than then
    return "rawtext_less_than"
  end
  if codepoint == EOF then
    parser:emit_eof()
    return
  end
  parser:emit_character(uchar(codepoint))
  return "rawtext"
end
--- State after "<" in RAWTEXT.
-- @param parser table HtmlParser object
-- @return string name of the next state
HtmlStates.rawtext_less_than = function(parser)
  if parser.codepoint == solidus then
    return "rawtext_end_tag_open"
  end
  -- no end tag, "<" is just a text
  parser:emit_character("<")
  return parser:tokenize("rawtext")
end
--- State after "</" in RAWTEXT.
-- @param parser table HtmlParser object
-- @return string name of the next state
HtmlStates.rawtext_end_tag_open = function(parser)
  local codepoint = parser.codepoint
  if is_alpha(codepoint) then
    parser:start_token("end_tag", {name={}})
    -- the buffer keeps the characters of the name as they were written
    parser.temp_buffer = {}
    return parser:tokenize("rawtext_end_tag_name")
  else
    -- not an end tag: the consumed "</" is a text and must be printed
    -- (an empty string was emitted here, which lost both characters)
    parser:emit_character("</")
    return parser:tokenize("rawtext")
  end
end
--- Collect name of a possible end tag in RAWTEXT.
-- The end tag is accepted only when its name is the same as the name of the
-- currently opened element.
-- @param parser table HtmlParser object
-- @return string name of the next state
HtmlStates.rawtext_end_tag_name = function(parser)
  -- we need to find name of the currently opened tag
  local parent = parser:get_parent() or {}
  local opened_tag = parent.tag
  local current_tag = table.concat(parser.current_token.name or {})
  local codepoint = parser.codepoint
  if is_upper_alpha(codepoint) then
    parser:append_token_data("name", uchar(codepoint + 0x20))
    table.insert(parser.temp_buffer, uchar(codepoint))
    return "rawtext_end_tag_name"
  elseif is_lower_alpha(codepoint) then
    parser:append_token_data("name", uchar(codepoint))
    table.insert(parser.temp_buffer, uchar(codepoint))
    return "rawtext_end_tag_name"
  end
  if opened_tag == current_tag then
    if is_space(codepoint) then
      return "before_attribute_name"
    elseif codepoint == solidus then
      return "self_closing_tag"
    elseif codepoint == greater_than then
      parser:emit()
      return "data"
    end
    -- any other character falls through to the code below; it used to be
    -- silently dropped, with no state returned
  end
  -- not the end tag we are looking for: print "</" and the collected name as
  -- a text and reconsume the current character
  discard_rcdata_end_tag(parser, "</" .. table.concat(parser.temp_buffer))
  parser.temp_buffer = {}
  return parser:tokenize("rawtext")
end
--- Script data state.
-- @param parser table HtmlParser object
-- @return string|nil name of the next state; nothing is returned at the end
--   of the stream
HtmlStates.script_data = function(parser)
  local codepoint = fix_null(parser.codepoint)
  if codepoint == less_than then
    return "script_data_less_than"
  end
  if codepoint == EOF then
    parser:emit_eof()
    return
  end
  parser:emit_character(uchar(codepoint))
  return "script_data"
end
--- State after "<" in script data.
-- NOTE(review): the previous body looked like two different states merged by
-- a lost piece of text: it emitted an empty string for "<!" and it sent
-- ordinary characters to the escaped state without printing the "<". The
-- transitions below follow the "script data less-than sign state" of the
-- HTML standard. They expect that the "script_data_escape_start" state is
-- defined in HtmlStates -- confirm that it is.
-- @param parser table HtmlParser object
-- @return string name of the next state
HtmlStates.script_data_less_than = function(parser)
  local codepoint = parser.codepoint
  if codepoint == solidus then
    -- possible end tag, its name will be collected in the buffer
    parser.temp_buffer = {}
    return "script_data_end_tag_open"
  elseif codepoint == exclam then
    -- possible start of the escaped section
    parser:emit_character("<!")
    return "script_data_escape_start"
  else
    -- "<" is just a text; the current character (including the end of the
    -- stream) is reconsumed as script data
    parser:emit_character("<")
    return parser:tokenize("script_data")
  end
end
--- State after "<" in escaped script data.
-- @param parser table HtmlParser object
-- @return string name of the next state
HtmlStates.script_data_escaped_less_than_sign = function(parser)
  local codepoint = parser.codepoint
  if codepoint == solidus then
    parser.temp_buffer = {}
    return "script_data_escaped_end_tag_open"
  end
  -- "<" is a text in all other cases; a letter may start a nested script tag
  local next_state = "script_data_escaped"
  if is_alpha(codepoint) then
    parser.temp_buffer = {}
    next_state = "script_data_double_escape_start"
  end
  parser:emit_character("<")
  return parser:tokenize(next_state)
end
--- State after "</" in escaped script data.
-- @param parser table HtmlParser object
-- @return string name of the next state
HtmlStates.script_data_escaped_end_tag_open = function(parser)
  local codepoint = parser.codepoint
  if is_alpha(codepoint) then
    parser:start_token("end_tag", {name={}})
    return parser:tokenize("script_data_escaped_end_tag_name")
  else
    -- not an end tag: the consumed "</" is a text and must be printed
    -- (an empty string was emitted here, which lost both characters)
    parser:emit_character("</")
    return parser:tokenize("script_data_escaped")
  end
end
--- Collect name of a possible end tag in escaped script data.
-- The end tag is accepted only when its name is the same as the name of the
-- currently opened element.
-- @param parser table HtmlParser object
-- @return string name of the next state
HtmlStates.script_data_escaped_end_tag_name = function(parser)
  -- we need to find name of the currently opened tag
  local parent = parser:get_parent() or {}
  local opened_tag = parent.tag
  local current_tag = table.concat(parser.current_token.name or {})
  local codepoint = parser.codepoint
  if is_upper_alpha(codepoint) then
    parser:append_token_data("name", uchar(codepoint + 0x20))
    table.insert(parser.temp_buffer, uchar(codepoint))
    return "script_data_escaped_end_tag_name"
  elseif is_lower_alpha(codepoint) then
    parser:append_token_data("name", uchar(codepoint))
    table.insert(parser.temp_buffer, uchar(codepoint))
    return "script_data_escaped_end_tag_name"
  end
  if opened_tag == current_tag then
    if is_space(codepoint) then
      return "before_attribute_name"
    elseif codepoint == solidus then
      return "self_closing_tag"
    elseif codepoint == greater_than then
      parser:emit()
      return "data"
    end
    -- any other character falls through to the code below; it used to be
    -- silently dropped, with no state returned
  end
  -- not the end tag we are looking for: print "</" and the collected name as
  -- a text and reconsume the current character
  discard_rcdata_end_tag(parser, "</" .. table.concat(parser.temp_buffer))
  parser.temp_buffer = {}
  return parser:tokenize("script_data_escaped")
end
--- Collect a tag name that follows "<" in escaped script data.
-- The lowercased name is kept in parser.temp_buffer; when it is "script", the
-- tokenizer enters the double escaped state.
-- Changes against the previous version: the name ends at a space, "/" or ">"
-- (letters were tested instead of spaces, so the name was never collected),
-- the collected name is compared instead of the name of the current token,
-- and uppercase letters are lowercased by uchar(codepoint + 0x20) (a number
-- was added to the string returned by uchar, which is an error).
-- @param parser table HtmlParser object
-- @return string name of the next state
HtmlStates.script_data_double_escape_start = function(parser)
  local codepoint = parser.codepoint
  if is_space(codepoint) or
    codepoint == solidus or
    codepoint == greater_than
  then
    local collected_name = table.concat(parser.temp_buffer)
    parser:emit_character(uchar(codepoint))
    if collected_name == "script" then
      return "script_data_double_escaped"
    else
      return "script_data_escaped"
    end
  elseif is_upper_alpha(codepoint) then
    parser:emit_character(uchar(codepoint))
    table.insert(parser.temp_buffer, uchar(codepoint + 0x20))
    return "script_data_double_escape_start"
  elseif is_lower_alpha(codepoint) then
    parser:emit_character(uchar(codepoint))
    table.insert(parser.temp_buffer, uchar(codepoint))
    return "script_data_double_escape_start"
  else
    return parser:tokenize("script_data_escaped")
  end
end
--- Script data double escaped state.
-- @param parser table HtmlParser object
-- @return string|nil name of the next state; nothing is returned at the end
--   of the stream
HtmlStates.script_data_double_escaped = function(parser)
  local codepoint = fix_null(parser.codepoint)
  if codepoint == EOF then
    parser:emit_eof()
    return
  end
  if codepoint == hyphen then
    parser:emit_character("-")
    return "script_data_double_escaped_dash"
  end
  if codepoint == less_than then
    parser:emit_character("<")
    return "script_data_double_escaped_less_than_sign"
  end
  parser:emit_character(uchar(codepoint))
  return "script_data_double_escaped"
end
--- State after "-" in double escaped script data.
-- @param parser table HtmlParser object
-- @return string|nil name of the next state; nothing is returned at the end
--   of the stream
HtmlStates.script_data_double_escaped_dash = function(parser)
  local codepoint = fix_null(parser.codepoint)
  if codepoint == EOF then
    parser:emit_eof()
    return
  end
  if codepoint == hyphen then
    parser:emit_character("-")
    return "script_data_double_escaped_dash"
  end
  if codepoint == less_than then
    parser:emit_character("<")
    return "script_data_double_escaped_less_than_sign"
  end
  if codepoint == greater_than then
    -- end of the escaped section
    parser:emit_character(">")
    return "script_data"
  end
  parser:emit_character(uchar(codepoint))
  return "script_data_double_escaped"
end
--- State after "<" in double escaped script data.
-- @param parser table HtmlParser object
-- @return string name of the next state
HtmlStates.script_data_double_escaped_less_than_sign = function(parser)
  local codepoint = parser.codepoint
  if codepoint == solidus then
    -- the name of the end tag will be collected in the buffer
    parser.temp_buffer = {}
    -- "/" is a text here; parser:emit was called by mistake, but that method
    -- is used everywhere else in the tokenizer to emit the current token
    parser:emit_character("/")
    return "script_data_double_escape_end"
  else
    return parser:tokenize("script_data_double_escaped")
  end
end
--- Collect a tag name that follows "</" in double escaped script data.
-- The lowercased name is kept in parser.temp_buffer; when it is "script", the
-- tokenizer returns to the escaped state.
-- Changes against the previous version: the name ends at a space, "/" or ">"
-- (letters were tested instead of spaces, so the name was never collected),
-- the collected name is compared instead of the name of the current token,
-- uppercase letters are lowercased by uchar(codepoint + 0x20) (a number was
-- added to the string returned by uchar, which is an error), and letters keep
-- the tokenizer in this state instead of the double_escape_start state.
-- @param parser table HtmlParser object
-- @return string name of the next state
HtmlStates.script_data_double_escape_end = function(parser)
  local codepoint = parser.codepoint
  if is_space(codepoint) or
    codepoint == solidus or
    codepoint == greater_than
  then
    local collected_name = table.concat(parser.temp_buffer)
    parser:emit_character(uchar(codepoint))
    if collected_name == "script" then
      return "script_data_escaped"
    else
      return "script_data_double_escaped"
    end
  elseif is_upper_alpha(codepoint) then
    parser:emit_character(uchar(codepoint))
    table.insert(parser.temp_buffer, uchar(codepoint + 0x20))
    return "script_data_double_escape_end"
  elseif is_lower_alpha(codepoint) then
    parser:emit_character(uchar(codepoint))
    table.insert(parser.temp_buffer, uchar(codepoint))
    return "script_data_double_escape_end"
  else
    return parser:tokenize("script_data_double_escaped")
  end
end
-- formatting elements need special treatment
local formatting_element_names = {
  a = true, b = true, big = true, code = true, em = true,
  font = true, i = true, nobr = true, s = true, small = true,
  strike = true, strong = true, tt = true, u = true
}

--- Test whether an element is a formatting element.
-- @param name string element name
-- @return true for formatting elements, nil otherwise
local function is_formatting_element(name)
  local found = formatting_element_names[name]
  return found
end
--- Convert an array to a set.
-- @param tbl table array of values
-- @return table new table where every value from the array is a key set to true
local function hash_from_array(tbl)
  local set = {}
  for _, value in ipairs(tbl) do
    set[value] = true
  end
  return set
end
-- set of names of the special elements
local special_elements_list = hash_from_array({
  "address", "applet", "area", "article", "aside", "base", "basefont",
  "bgsound", "blockquote", "body", "br", "button", "caption", "center", "col",
  "colgroup", "dd", "details", "dir", "div", "dl", "dt", "embed", "fieldset",
  "figcaption", "figure", "footer", "form", "frame", "frameset", "h1", "h2",
  "h3", "h4", "h5", "h6", "head", "header", "hgroup", "hr", "html", "iframe",
  "img", "input", "keygen", "li", "link", "listing", "main", "marquee", "menu",
  "meta", "nav", "noembed", "noframes", "noscript", "object", "ol", "p",
  "param", "plaintext", "pre", "script", "section", "select", "source",
  "style", "summary", "table", "tbody", "td", "template", "textarea", "tfoot",
  "th", "thead", "title", "tr", "track", "ul", "wbr", "xmp",
  "mi", "mo", "mn", "ms", "mtext", "annotation-xml", "foreignObject", "desc",
  "title"
})

--- Test whether an element is a special element.
-- @param name string element name
-- @return true for special elements, nil otherwise
local function is_special(name)
  local found = special_elements_list[name]
  return found
end
-- these lists are used in HtmlParser:generate_implied_endtags()
-- The ruby elements are `rt` and `rtc`; they were misspelled as `rd` and
-- `trc`, so their end tags were never implied.
local implied_endtags = {
  dd = true, dt = true, li = true, optgroup = true, option = true, p = true,
  rb = true, rp = true, rt = true, rtc = true
}
-- the same elements, plus the elements of tables
local implied_endtags_thoroughly = {
  dd = true, dt = true, li = true, optgroup = true, option = true, p = true,
  rb = true, rp = true, rt = true, rtc = true,
  caption = true, colgroup = true, tbody = true, td = true,
  tfoot = true, th = true, thead = true, tr = true
}
--- Find whether the list of unfinished elements contains a tag.
-- The list is searched from the most recently opened element. The search
-- fails when an element from element_list is found before the target.
-- @param parser table object with the `unfinished` list of opened elements
-- @param target string searched tag name
-- @param element_list table set of tag names that terminate the search
-- @return boolean true when the target is in the scope
local function is_in_scope(parser, target, element_list)
  local opened = parser.unfinished
  local index = #opened
  while index >= 1 do
    local tag = opened[index].tag
    if tag == target then
      return true
    end
    if element_list[tag] then
      return false
    end
    index = index - 1
  end
  return false
end
-- elements that terminate the search in the basic scope
local particular_scope_elements = {
  applet = true, caption = true, html = true, table = true, td = true,
  th = true, marquee = true, object = true, template = true,
  mi = true, mo = true, mn = true, ms = true, mtext = true,
  ["annotation-xml"] = true,
  foreignObject = true, desc = true, title = true,
}

--- Find whether a tag is in the basic scope.
-- @param parser table HtmlParser object
-- @param target string searched tag name
-- @return boolean true when the target is in the scope
local function is_in_particular_scope(parser, target)
  local terminators = particular_scope_elements
  return is_in_scope(parser, target, terminators)
end
-- scopes derived from the basic scope
--
-- list item scope: the basic scope elements, ol and ul
local list_item_scope_elements = {ol = true, ul = true}
for name, value in pairs(particular_scope_elements) do
  list_item_scope_elements[name] = value
end

--- Find whether a tag is in the list item scope.
-- @param parser table HtmlParser object
-- @param target string searched tag name
-- @return boolean true when the target is in the scope
local function is_in_list_item_scope(parser, target)
  local terminators = list_item_scope_elements
  return is_in_scope(parser, target, terminators)
end
-- button scope: the basic scope elements and button
local button_scope_elements = {button = true}
for name, value in pairs(particular_scope_elements) do
  button_scope_elements[name] = value
end

--- Find whether a tag is in the button scope.
-- @param parser table HtmlParser object
-- @param target string searched tag name
-- @return boolean true when the target is in the scope
local function is_in_button_scope(parser, target)
  local terminators = button_scope_elements
  return is_in_scope(parser, target, terminators)
end
-- table scope: only html, table and template terminate the search
local table_scope_elements = {
  html = true,
  table = true,
  template = true
}

--- Find whether a tag is in the table scope.
-- @param parser table HtmlParser object
-- @param target string searched tag name
-- @return boolean true when the target is in the scope
local function is_in_table_scope(parser, target)
  local terminators = table_scope_elements
  return is_in_scope(parser, target, terminators)
end
--- Find whether a tag is in the select scope.
-- This scope is specific: the search is terminated by every element except
-- for optgroup and option.
-- @param parser table object with the `unfinished` list of opened elements
-- @param target string searched tag name
-- @return boolean true when the target is in the scope
local function is_in_select_scope(parser, target)
  local opened = parser.unfinished
  for index = #opened, 1, -1 do
    local tag = opened[index].tag
    if tag == target then
      return true
    end
    -- only these two tags can be skipped
    if tag ~= "optgroup" and tag ~= "option" then
      return false
    end
  end
  return false
end
-- List of active formatting elements
-- https://html.spec.whatwg.org/multipage/parsing.html#the-list-of-active-formatting-elements
-- it is not implemented yet, but maybe in the future.

local HtmlTreeStates = {}

--- @type HtmlParser
local HtmlParser = {}

--- Initialize the HTML Object
---@param body string HTML to be parsed
---@return table initialized object
function HtmlParser:init(body)
  self.__index = self
  local o = setmetatable({}, self)
  -- input
  o.body = self:normalize_newlines(body) -- HTML string with normalized newlines
  o.position = 0                   -- position in the parsed string
  -- document tree
  o.unfinished = {}                -- list of opened elements, empty at the start
  o.Document = Root:init()
  -- tokenizer
  o.default_state = "data"         -- default state machine state
  o.state = o.default_state        -- working state of the machine
  o.return_state = o.default_state -- special state set by entities parsing
  o.temp_buffer = {}               -- keep temporary data
  o.current_token = {type="start"} -- currently processed token
  -- tree construction
  o.insertion_mode = "initial"     -- tree construction state
  o.head_pointer = nil             -- pointer to the Head element
  o.form_pointer = nil
  o.active_formatting = {}         -- list of active formatting elements
  o.scripting_flag = false         -- we will not support scripting
  return o
end
--- Convert all newlines to line feeds.
-- @param body string text with arbitrary newlines
-- @return string text where "\r\n" and single "\r" are replaced by "\n"
function HtmlParser:normalize_newlines(body)
  -- gsub returns also the number of replacements; the parentheses keep only
  -- the string, so the count doesn't leak to the caller as a second value
  return (body:gsub("\r\n", "\n"):gsub("\r", "\n"))
end
-- void elements: as a list, and as a set indexed by the element name
local self_closing_tags_list = {
  "area", "base", "br", "col", "embed", "hr", "img", "input",
  "link", "meta", "param", "source", "track", "wbr"
}
local self_closing_tags = {}
for index = 1, #self_closing_tags_list do
  self_closing_tags[self_closing_tags_list[index]] = true
end
--- Execute the HTML parser
--- @return table Root node of the HTML DOM
function HtmlParser:parse()
  -- The input must be in UTF-8. Sources in other encodings must be converted
  -- first, for example using the luaxml-encodings library.
  self.text = {}
  self.state = self.default_state
  -- Characters at positions up to ignored_pos are not tokenized. This makes
  -- it possible to pass over characters that we want to ignore.
  self.ignored_pos = -1
  for position, codepoint in utf8.codes(self.body) do
    if position > self.ignored_pos then
      -- save info about the current character for the state functions
      self.position = position
      self.codepoint = codepoint
      self.character = uchar(codepoint)
      -- a state that doesn't return anything is kept for the next character
      local next_state = self:tokenize(self.state)
      self.state = next_state or self.state
    end
  end
  return self:finish()
end
function HtmlParser:tokenize(state)
local state = state or self.state
local ucode = self.codepoint
local text = self.text
self.last_position = self.position
self.element_state = false
-- execute state machine object and return new state
local fn = HtmlStates[state] or function(parser) return self.default_state end
local newstate = fn(self)
-- this should enable changing state from elements that needs special treatment, like