-- xml.lua
local utils = require 'pl.utils'
local split = utils.split;
local t_insert = table.insert;
local t_concat = table.concat;
local t_remove = table.remove;
local s_format = string.format;
local s_match = string.match;
local tostring = tostring;
local setmetatable = setmetatable;
local getmetatable = getmetatable;
local pairs = pairs;
local ipairs = ipairs;
local type = type;
local next = next;
local print = print;
local unpack = utils.unpack;
local s_gsub = string.gsub;
local s_char = string.char;
local s_find = string.find;
local os = os;
local pcall,require,io = pcall,require,io
-- Module table: the public functions of this file are stored in it and it is
-- returned at the end of the file.
local _M = {}
-- Metatable attached to document nodes. Its __index points back at itself,
-- so the methods defined on Doc are reachable from every node carrying it.
local Doc = { __type = "doc" };
Doc.__index = Doc;
--- Create a new document node.
-- @param tag the tag name stored on the node
-- @param attr optional attribute table; a fresh empty table is used when omitted
-- @return the new node, carrying the Doc metatable
function _M.new(tag, attr)
    local node = setmetatable({}, Doc)
    node.tag = tag
    node.attr = attr or {}
    -- stack of tags opened through addtag and not yet closed with up
    node.last_add = {}
    return node
end
--- Parse an XML document from a string or from a file.
-- Uses `lxp.lom` when it can be required, otherwise the built-in basic parser.
-- @param text_or_file XML text, or a file name when `is_file` is truthy
-- @param is_file treat the first argument as a path and read the whole file
-- @param use_basic force the built-in basic parser
-- @return the parsed document, or nil plus an error message
function _M.parse(text_or_file, is_file, use_basic)
    local parser, lom
    if use_basic then
        parser = _M.basic_parse
    else
        local loaded
        loaded, lom = pcall(require, 'lxp.lom')
        if loaded then
            parser = lom.parse
        else
            parser = _M.basic_parse
        end
    end

    local source = text_or_file
    if is_file then
        local handle, open_err = io.open(source)
        if not handle then return nil, open_err end
        source = handle:read '*a'
        handle:close()
    end

    local doc, parse_err = parser(source)
    if not doc then return nil, parse_err end

    -- `lom` is non-nil whenever the require was attempted (on failure it holds
    -- the error message), so the metatables are (re)applied in both cases.
    if lom then
        _M.walk(doc, false, function(_, node)
            setmetatable(node, Doc)
        end)
    end
    return doc
end
--- Open a new child tag; subsequent additions go inside it until `up` is called.
-- @param tag name of the new tag
-- @param attrs optional attribute table for the new tag
-- @return the document itself, for chaining
function Doc:addtag(tag, attrs)
    local child = _M.new(tag, attrs)
    local open = self.last_add
    -- attach to the innermost open tag, or to the document when none is open
    local target = open[#open] or self
    target:add_direct_child(child)
    t_insert(open, child)
    return self
end
--- Append a text node to the innermost open tag (or to the document itself).
-- @param text the string to add
-- @return the document itself, for chaining
function Doc:text(text)
    local open = self.last_add
    local target = open[#open] or self
    target:add_direct_child(text)
    return self
end
--- Close the innermost open tag, so later additions go to its parent.
-- @return the document itself, for chaining
function Doc:up()
    local open = self.last_add
    t_remove(open)
    return self
end
--- Forget every open tag, so later additions go directly to the document.
-- The `last_add` table is emptied in place rather than replaced.
-- @return the document itself, for chaining
function Doc:reset()
    local open = self.last_add
    local n = #open
    while n > 0 do
        open[n] = nil
        n = n - 1
    end
    return self
end
--- Append a child (element or text) directly to this node.
-- @param child the value to append to the node's array part
function Doc:add_direct_child(child)
    self[#self + 1] = child
end
--- Append a child to the innermost open tag (or to the document itself).
-- @param child element or text to add
-- @return the document itself, for chaining
function Doc:add_child(child)
    local open = self.last_add
    local target = open[#open] or self
    target:add_direct_child(child)
    return self
end
--- Set several attributes at once.
-- @param t table of attribute name/value pairs copied into the node's `attr`
function Doc:set_attribs (t)
    local attr = self.attr
    for name, value in pairs(t) do
        attr[name] = value
    end
end
--- Set a single attribute.
-- @param a attribute name
-- @param v attribute value (nil removes the entry)
function Doc:set_attrib(a,v)
    local attr = self.attr
    attr[a] = v
end
--- Return the node's attribute table (the table itself, not a copy).
function Doc:get_attribs()
    local attr = self.attr
    return attr
end
--- True when the value is a string, i.e. a text node rather than an element.
local function is_text(s)
    local kind = type(s)
    return kind == 'string'
end
--- Build an element from a tag name and a description of its contents.
-- @param tag the tag name
-- @param items a string (single text child), an element (single child), or
-- a table whose string keys become attributes and whose other keys are
-- copied onto the node as children
-- @return the new element
function _M.elem(tag,items)
    local node = _M.new(tag)
    if is_text(items) then items = {items} end

    if _M.is_tag(items) then
        t_insert(node, items)
        return node
    end
    if type(items) ~= 'table' then return node end

    local attr = node.attr
    for key, value in pairs(items) do
        if is_text(key) then
            attr[key] = value
            -- the name is also appended to the array part of attr, which
            -- records the order in which attributes were set
            t_insert(attr, key)
        else
            node[key] = value
        end
    end
    return node
end
--- Create one element constructor per tag name.
-- @param list array of tag names, or a comma-separated string of names
-- @return the constructors, in the same order, as multiple return values;
-- each one takes `items` and calls `_M.elem(tag, items)`
function _M.tags(list)
    if is_text(list) then list = split(list,'%s*,%s*') end
    local ctors = {}
    for index, tag in ipairs(list) do
        ctors[index] = function(items)
            return _M.elem(tag, items)
        end
    end
    return unpack(ctors)
end
-- Parsed templates, keyed by their source text.
local templ_cache = {}

--- Turn a template argument into a document.
-- Strings are parsed with the basic parser and memoised in `templ_cache`;
-- documents are passed through unchanged.
-- @param templ template source text or an already parsed document
-- @return the document, or nil plus an error message
local function template_cache (templ)
    if not is_text(templ) then
        if not _M.is_tag(templ) then
            return nil, "template is not a document"
        end
        return templ
    end

    local cached = templ_cache[templ]
    if cached then return cached end

    local doc, err = _M.parse(templ, false, true)
    if not doc then return nil, err end
    templ_cache[templ] = doc
    return doc
end
--- True when `data` describes a single record: it has no array part, or its
-- first array item is not itself a table.
local function is_data(data)
    if #data == 0 then return true end
    return type(data[1]) ~= 'table'
end
--- Mirror each array item under its index converted to a string, so that a
-- lookup with the key "1" finds the same value as a lookup with the key 1.
local function prepare_data(data)
    for index, value in ipairs(data) do
        local key = tostring(index)
        data[key] = value
    end
end
--- Instantiate a template by substituting `$name` variables.
-- @param templ template text or document
-- @param data either a single record (name -> value, array items are also
-- reachable as $1, $2, ...) or an array of such records
-- @return for a single record the substituted document; for an array, a list
-- of documents, wrapped in an element named `data.tag` when that field is set.
-- On failure nil plus an error message.
function Doc.subst(templ, data)
    if type(data) ~= 'table' or not next(data) then
        return nil, "data must be a non-empty table"
    end

    local single = is_data(data)
    if single then prepare_data(data) end

    local err
    templ, err = template_cache(templ)
    if err then return nil, err end

    -- clone the template, looking up every $name of every string in `item`
    local function instantiate(item)
        return _M.clone(templ, function(s)
            return s:gsub('%$(%w+)',item)
        end)
    end

    if single then return instantiate(data) end

    local list = {}
    for _, item in ipairs(data) do
        prepare_data(item)
        t_insert(list, instantiate(item))
    end
    if data.tag then
        list = _M.elem(data.tag, list)
    end
    return list
end
--- Find the first direct child whose tag equals `tag`.
-- @param tag the tag name to look for
-- @return the child, or nothing when there is no such child
function Doc:child_with_name(tag)
    for _, node in ipairs(self) do
        local matches = node.tag == tag
        if matches then return node end
    end
end
--- Collect the child elements of `self` named `tag` into `list`.
-- @param self node whose children are scanned
-- @param tag tag name to collect
-- @param list array receiving the matches
-- @param recurse when truthy, descend into every child element as well
local function _children_with_name(self, tag, list, recurse)
    for _, node in ipairs(self) do
        local is_node = type(node) == 'table'
        if is_node and node.tag == tag then
            t_insert(list, node)
        end
        if is_node and recurse then
            _children_with_name(node, tag, list, recurse)
        end
    end
end
--- Return every descendant element with the given tag name.
-- @param tag the tag name to look for
-- @param dont_recurse when truthy, only direct children are considered
-- @return array of matching elements (possibly empty)
function Doc:get_elements_with_name(tag,dont_recurse)
    local found = {}
    local recurse = not dont_recurse
    _children_with_name(self, tag, found, recurse)
    return found
end
--- Iterate over all direct children (text and elements alike).
-- @return an iterator triple for the generic `for`; each step yields the
-- next child and the loop ends at the first nil entry
function Doc:children()
    local index = 0
    local function step(node)
        index = index + 1
        return node[index]
    end
    return step, self, index
end
--- Return the first direct child that is a table (an element).
-- @return the child, or nothing when the node has no element children
function Doc:first_childtag()
    if #self ~= 0 then
        for _, node in ipairs(self) do
            if type(node) == 'table' then return node end
        end
    end
end
--- Iterate over the direct children that match a tag name and/or namespace.
-- @param tag tag name to match; nil matches any child
-- @param xmlns value the child's `xmlns` attribute must equal; defaults to
-- this node's own `attr.xmlns`; nil disables the namespace check
-- @return an iterator triple for the generic `for`
--
-- Text children have no attributes. They are skipped whenever a namespace
-- filter is active; previously this raised "attempt to index a nil value"
-- when `tag` was nil and `xmlns` was set. With neither filter active every
-- child is still yielded, text included, as before.
function Doc:matching_tags(tag, xmlns)
    xmlns = xmlns or self.attr.xmlns
    local tags = self
    local start_i, max_i = 1, #tags
    return function ()
        for i = start_i, max_i do
            local v = tags[i]
            local is_elem = type(v) == 'table'
            local tag_ok = not tag or (is_elem and v.tag == tag)
            local ns_ok = not xmlns
                or (is_elem and v.attr ~= nil and xmlns == v.attr.xmlns)
            if tag_ok and ns_ok then
                -- resume after this child on the next call
                start_i = i + 1
                return v
            end
        end
    end, tags, start_i
end
--- Iterate over the direct children that are elements, skipping text.
-- @return an iterator triple for the generic `for`; iteration stops at the
-- first nil (or false) entry
function Doc:childtags()
    local index = 0
    local function step()
        while true do
            index = index + 1
            local node = self[index]
            if not node then return end
            if type(node) == 'table' then return node end
        end
    end
    return step, self[1], index
end
--- Replace or remove the direct child elements of a node, in place.
-- @param callback called with each child element; a nil result removes the
-- child, any other result replaces it
-- @return the node itself
--
-- Text children are left untouched. The index now advances past them; the
-- previous code never incremented it for a non-element child and therefore
-- looped forever on any node that contained text.
function Doc:maptags(callback)
    local is_tag = _M.is_tag
    local i = 1
    while i <= #self do
        local child = self[i]
        if is_tag(child) then
            local ret = callback(child)
            if ret == nil then
                -- removal shifts the following children down, so stay on i
                t_remove(self, i)
            else
                self[i] = ret
                i = i + 1
            end
        else
            i = i + 1
        end
    end
    return self
end
--- Escape the five characters that are special in XML text and attribute
-- values (' " < > &), replacing each by its predefined entity.
-- @param str the string to escape
-- @return the escaped string (single value; the gsub count is dropped)
local xml_escape
do
    -- The entity names had been lost from this table (each character mapped
    -- to itself and the double-quote entry did not even parse); restored to
    -- the XML predefined entities.
    local escape_table = {
        ["'"] = "&apos;",
        ["\""] = "&quot;",
        ["<"] = "&lt;",
        [">"] = "&gt;",
        ["&"] = "&amp;",
    };
    function xml_escape(str) return (s_gsub(str, "['&<>\"]", escape_table)); end
    _M.xml_escape = xml_escape;
end
-- Serialise node `t` (and, recursively, its child elements) by appending
-- string fragments to `buf`.
-- @param t node with `tag`, `attr` and its children in the array part
-- @param buf array receiving the output fragments
-- @param self the function called to serialise child elements (the tostring
-- entry point passes this very function)
-- @param xml_escape function applied to attribute values and text children
-- @param parentns the parent's `xmlns` attribute; an `xmlns` attribute equal
-- to it is not written again
-- @param idn current indentation prefix
-- @param indent when truthy, every tag starts on a new line prefixed by `idn`;
-- it is also appended to `idn` for the children
-- @param attr_indent when truthy, every plain attribute goes on its own line,
-- prefixed by idn..attr_indent
local function _dostring(t, buf, self, xml_escape, parentns, idn, indent, attr_indent)
-- counter used to generate the ns1, ns2, ... prefixes for namespaced attributes
local nsid = 0;
local tag = t.tag
-- lf: separator written before a tag; alf: separator written before a plain attribute
local lf,alf = ""," "
if indent then lf = '\n'..idn end
if attr_indent then alf = '\n'..idn..attr_indent end
t_insert(buf, lf.."<"..tag);
local function write_attr(k,v)
-- a key containing "\1" is "<namespace>\1<name>": declare a generated
-- prefix for the namespace and write the attribute with that prefix
if s_find(k, "\1", 1, true) then
local ns, attrk = s_match(k, "^([^\1]*)\1?(.*)$");
nsid = nsid + 1;
t_insert(buf, " xmlns:ns"..nsid.."='"..xml_escape(ns).."' ".."ns"..nsid..":"..attrk.."='"..xml_escape(v).."'");
-- skip an xmlns declaration that merely repeats the parent's
elseif not(k == "xmlns" and v == parentns) then
t_insert(buf, alf..k.."='"..xml_escape(v).."'");
end
end
-- when attr has an array part it lists the attribute names in the order to
-- write them; otherwise the (unspecified) pairs order is used
if #t.attr > 0 then
for _,k in ipairs(t.attr) do
write_attr(k,t.attr[k])
end
else
for k, v in pairs(t.attr) do
write_attr(k,v)
end
end
-- has_children starts as nil and is set once a child element has been written
local len,has_children = #t;
if len == 0 then
-- no children: emit a self-closing tag
local out = "/>"
if attr_indent then out = '\n'..idn..out end
t_insert(buf, out);
else
t_insert(buf, ">");
for n=1,len do
local child = t[n];
if child.tag then
-- child element: recurse with this node's xmlns as parentns; the
-- deeper indentation idn..indent is computed whenever idn is non-nil
self(child, buf, self, xml_escape, t.attr.xmlns,idn and idn..indent, indent, attr_indent );
has_children = true
else t_insert(buf, xml_escape(child));
end
end
-- the closing tag goes on its own line only if a child element was written
t_insert(buf, (has_children and lf or '').."</"..tag..">");
end
end
--- Convert a document to its XML text.
-- @param t the document
-- @param idn initial indentation prefix
-- @param indent indentation added per nesting level (enables line breaks)
-- @param attr_indent when set, attributes are written one per line
-- @param xml when truthy, the output starts with an XML declaration
-- @return the XML string
function _M.tostring(t,idn,indent, attr_indent, xml)
    local pieces = {}
    if xml then
        pieces[1] = "<?xml version='1.0'?>"
    end
    _dostring(t, pieces, _dostring, xml_escape, nil, idn, indent, attr_indent)
    local text = t_concat(pieces)
    return text
end
-- tostring(doc) therefore yields the unindented XML text
Doc.__tostring = _M.tostring
--- Concatenate the direct text children of a node (child elements are ignored).
-- @return the joined text, "" when the node has no text children
function Doc:get_text()
    local parts = {}
    for _, child in ipairs(self) do
        if type(child) == 'string' then
            parts[#parts + 1] = child
        end
    end
    return t_concat(parts)
end
--- Make a deep copy of a document, optionally transforming every string.
-- @param doc the document to copy
-- @param strsubst optional function(str, kind, parent) called for each string;
-- `kind` is '*TAG' for tag names, '*TEXT' for text children and the attribute
-- name for attribute values. Its result replaces the string in the copy.
-- @return the copy; nodes keep the metatable of their originals
function _M.clone(doc, strsubst)
    -- original node -> copy, so a node reachable twice is copied only once
    local seen = {}

    local copy
    function copy(object, kind, parent)
        if type(object) ~= "table" then
            if strsubst and is_text(object) then
                return strsubst(object, kind, parent)
            end
            return object
        end

        local existing = seen[object]
        if existing then return existing end

        local node = {}
        seen[object] = node
        node.tag = copy(object.tag, '*TAG', parent)

        local attrs = object.attr
        if attrs then
            local copied = {}
            for name, value in pairs(attrs) do
                copied[name] = copy(value, name, object)
            end
            node.attr = copied
        end

        for index = 1, #object do
            local child = copy(object[index], '*TEXT', object)
            t_insert(node, child)
        end
        return setmetatable(node, getmetatable(object))
    end

    return copy(doc)
end
-- the same function is available as a method: doc:filter(strsubst)
Doc.filter = _M.clone
--- Compare two documents (or text nodes) structurally.
-- @param t1 first document or string
-- @param t2 second document or string
-- @return true when equal; otherwise false plus a message describing the
-- first difference found
--
-- Fixes over the previous version:
-- * unequal text returned the message string as the *first* result, which is
--   truthy, so text differences in children were never detected;
-- * a failing child comparison returned only the message instead of
--   `false, message`;
-- * a nil tag or a missing `attr` table no longer raises while reporting.
function _M.compare(t1,t2)
    local ty1, ty2 = type(t1), type(t2)
    if ty1 ~= ty2 then return false, 'type mismatch' end
    if ty1 == 'string' then
        if t1 == t2 then return true end
        return false, 'text '..t1..' ~= text '..t2
    end
    -- both types are equal here, so testing one of them is sufficient
    if ty1 ~= 'table' then return false, 'not a document' end
    if t1.tag ~= t2.tag then
        return false, 'tag '..tostring(t1.tag)..' ~= tag '..tostring(t2.tag)
    end
    if #t1 ~= #t2 then
        return false, 'size '..#t1..' ~= size '..#t2..' for tag '..tostring(t1.tag)
    end
    -- attributes must agree in both directions
    local attr1, attr2 = t1.attr or {}, t2.attr or {}
    for k,v in pairs(attr1) do
        if attr2[k] ~= v then return false, 'mismatch attrib' end
    end
    for k,v in pairs(attr2) do
        if attr1[k] ~= v then return false, 'mismatch attrib' end
    end
    for i = 1,#t1 do
        local ok, err = _M.compare(t1[i], t2[i])
        if not ok then return false, err end
    end
    return true
end
--- True when the value is an element: a table whose `tag` field is a string.
function _M.is_tag(d)
    if type(d) ~= 'table' then return false end
    return is_text(d.tag)
end
--- Visit every element of a document.
-- @param doc the root element
-- @param depth_first when truthy a node is visited after its children,
-- otherwise before them
-- @param operation function(tag, node) called once per element
function _M.walk (doc, depth_first, operation)
    local pre_order = not depth_first
    if pre_order then operation(doc.tag, doc) end
    for _, child in ipairs(doc) do
        if _M.is_tag(child) then
            _M.walk(child, depth_first, operation)
        end
    end
    if not pre_order then operation(doc.tag, doc) end
end
-- Set of HTML tag names the HTML parser treats as empty elements, i.e. as if
-- they had been written with a trailing "/".
local html_empty_elements = {}
for name in ("br img meta frame area hr base col link input option param isindex embed"):gmatch("%S+") do
    html_empty_elements[name] = true
end
-- Named entities recognised when reading text and attribute values.
local escapes = {
    amp = "&",
    lt = "<",
    gt = ">",
    quot = "\"",
    apos = "'",
}

--- Replace the named entities listed in `escapes` by their characters.
-- Entities with other names are left as they are.
-- @param str the text to decode
-- @return the decoded text (single value)
local function unescape(str)
    local decoded = str:gsub("&(%a+);", escapes)
    return decoded
end
--- Parse HTML text with the basic parser in its HTML mode.
-- @param s the HTML text
-- @return whatever `_M.basic_parse` returns for it
function _M.parsehtml (s)
    local all_text, html = false, true
    return _M.basic_parse(s, all_text, html)
end
-- Minimal pattern-based XML/HTML parser.
-- @param s the document text
-- @param all_text when truthy, whitespace-only text between tags is kept
-- @param html when truthy: tag and attribute names are lowercased, the tags in
-- html_empty_elements are treated as self-closing, unquoted attribute values
-- are accepted and a leading <!DOCTYPE ...> is skipped (instead of <?...?>)
-- @return the first top-level item, or the second one when the first is text
-- Mismatched or unclosed tags are reported by raising an error with error().
function _M.basic_parse(s,all_text,html)
-- function-local aliases (these shadow the file-level ones)
local t_insert,t_remove = table.insert,table.remove
local s_find,s_sub = string.find,string.sub
-- stack of elements currently open; stack[1] is a sentinel collecting the top-level items
local stack = {}
local top = {}
-- parse the attribute text of a tag into a name -> value table
local function parseargs(s)
local arg = {}
-- quoted values: name = "value" or name = 'value'
s:gsub("([%w:]+)%s*=%s*([\"'])(.-)%2", function (w, _, a)
if html then w = w:lower() end
arg[w] = unescape(a)
end)
if html then
-- unquoted values, HTML mode only
s:gsub("([%w:]+)%s*=%s*([^\"']+)%s*", function (w, a)
w = w:lower()
arg[w] = unescape(a)
end)
end
return arg
end
t_insert(stack, top)
local ni,c,label,xarg, empty, _, istart
-- i: where the next search starts; j: end of the tag found last
local i, j = 1, 1
-- skip a leading processing instruction (XML) or DOCTYPE (HTML)
if not html then _,istart = s_find(s,'^%s*<%?[^%?]+%?>%s*')
else _,istart = s_find(s,'^%s*<!DOCTYPE.->%s*')
end
if istart then i = istart+1 end
while true do
-- next tag: c is "/", "!" or "", label the name, xarg the raw attribute
-- text, empty is "/" for a self-closing tag
ni,j,c,label,xarg, empty = s_find(s, "<([%/!]?)([%w:%-_]+)(.-)(%/?)>", i)
if not ni then break end
-- "<!...": a comment. Unless the pattern already consumed it completely,
-- look for the closing "-->" and continue after it.
if c == "!" then if not (label:match '%-%-$' and xarg == '') then
if xarg:match '%-%-$' then j = j - 2
end
_,j = s_find(s, "-->", j, true)
end
else
-- text between the previous tag and this one
local text = s_sub(s, i, ni-1)
if html then
label = label:lower()
if html_empty_elements[label] then empty = "/" end
-- empty branch: nothing special is done for <script>
if label == 'script' then
end
end
if all_text or not s_find(text, "^%s*$") then
t_insert(top, unescape(text))
end
-- self-closing tag: add directly; start tag: push a new level;
-- end tag: pop the level and attach it to its parent
if empty == "/" then t_insert(top, setmetatable({tag=label, attr=parseargs(xarg), empty=1},Doc))
elseif c == "" then top = setmetatable({tag=label, attr=parseargs(xarg)},Doc)
t_insert(stack, top) else local toclose = t_remove(stack) top = stack[#stack]
-- the stack is empty only if the sentinel itself was popped
if #stack < 1 then
error("nothing to close with "..label..':'..text)
end
if toclose.tag ~= label then
error("trying to close "..toclose.tag.." with "..label.." "..text)
end
t_insert(top, toclose)
end
end
i = j+1
end
-- text after the last tag
local text = s_sub(s, i)
if all_text or not s_find(text, "^%s*$") then
t_insert(stack[#stack], unescape(text))
end
if #stack > 1 then
error("unclosed "..stack[#stack].tag)
end
local res = stack[1]
-- skip a leading top-level text item
return is_text(res[1]) and res[2] or res[1]
end
--- True when `attr` is nil/false or a table for which next() finds no entry.
local function empty(attr)
    if not attr then return true end
    return not next(attr)
end

--- True when `d` is a table with a non-nil `tag` field.
local function is_element(d)
    if type(d) ~= 'table' then return false end
    return d.tag ~= nil
end

--- If `t` holds at most one entry, return its key and value (nil, nil for an
-- empty table); return false when it holds more than one.
local function has_one_element(t)
    local first_key, first_value = next(t)
    local second_key = next(t, first_key)
    if second_key ~= nil then return false end
    return first_key, first_value
end
--- Store the captures of one repeated-pattern match into `res`.
-- An empty `tbl` is ignored. A `_` capture, when present, is removed from
-- `tbl` and used as the key under which the rest is stored; without it the
-- captures are appended to the array part of `res`. A table whose only entry
-- has key 0 is replaced by that entry's value.
local function append_capture(res,tbl)
    if empty(tbl) then return end

    local key = tbl._
    if key then
        tbl._ = nil
        if empty(tbl) then return end
    end

    local numkey, val = has_one_element(tbl)
    if numkey == 0 then tbl = val end

    if key then
        res[key] = tbl
    else
        t_insert(res, tbl)
    end
end
--- Convert a string made only of decimal digits into a number; any other
-- string is returned unchanged.
local function make_number(pat)
    local all_digits = pat:find '^%d+$'
    if all_digits then
        return tonumber(pat)
    end
    return pat
end
--- Record a `$name` capture: the first character of `pat` is dropped, the
-- rest (converted to a number when all digits) becomes the key for `value`.
-- @return always true
local function capture_attrib(res,pat,value)
    local name = make_number(pat:sub(2))
    res[name] = value
    return true
end
-- Match data `d` against pattern `pat`, storing captures in `res`.
-- @param d a data element, or a string (attribute value / text); nil is
-- treated as the empty string
-- @param pat the pattern: an element, or a string when `d` is a string
-- @param res table receiving the captures
-- @param keep_going when truthy and `d` itself does not match, the child
-- elements of `d` are tried recursively
-- @return true when a match was found, false otherwise
local match
function match(d,pat,res,keep_going)
local ret = true
-- strings: a pattern starting with '$' is a capture and always succeeds;
-- any other pattern must be equal to the data
if d == nil then d = '' end if is_text(d) then
if not is_text(pat) then return false end
if _M.debug then print(d,pat) end
if pat:find '^%$' then
return capture_attrib(res,pat,d)
else
return d == pat
end
else
if _M.debug then print(d.tag,pat.tag) end
-- a pattern tag ending in '-' is a wildcard: the part before the '-' is
-- the capture name under which the data tag is stored (this happens
-- before the attributes and children are checked)
local tagpat = pat.tag:match '^(.-)%-$'
if tagpat then
tagpat = make_number(tagpat)
res[tagpat] = d.tag
end
if d.tag == pat.tag or tagpat then
-- every attribute of the pattern must match the data's attribute
if not empty(pat.attr) then
if empty(d.attr) then ret = false
else
for prop,pval in pairs(pat.attr) do
local dval = d.attr[prop]
if not match(dval,pval,res) then ret = false; break end
end
end
end
-- children: i walks the pattern's children, j the data's children
if ret and #pat > 0 then
local i,j = 1,1
-- advance j to the next data child, stepping over one text child;
-- false once the data children are exhausted
local function next_elem()
j = j + 1 if is_text(d[j]) then j = j + 1 end
return j <= #d
end
repeat
local p = pat[i]
-- a pattern child flagged `repeated` is tried against successive data
-- children; each successful match contributes its own capture table
if is_element(p) and p.repeated then
local found
repeat
local tbl = {}
ret = match(d[j],p,tbl,false)
if ret then
-- `found` is only ever set to false, so this loop always
-- runs until the data children are exhausted
found = false append_capture(res,tbl)
end
until not next_elem() or (found and not ret)
i = i + 1
else
-- ordinary pattern child: move on to the next one only after a match
ret = match(d[j],p,res,false)
if ret then i = i + 1 end
end
-- all pattern children consumed means success, whatever the last `ret` was
until not next_elem() or i > #pat if i > #pat then return true end
end
if ret then return true end
else
ret = false
end
-- no match at this node: optionally search the child elements
if keep_going then
for child in d:childtags() do
ret = match(child,pat,res,keep_going)
if ret then break end
end
end
end
return ret
end
--- Match this document against a pattern and collect the captures.
-- @param pat pattern text or an already parsed pattern document
-- @return the table of captures followed by the match result, or nil plus an
-- error message when the pattern cannot be turned into a document
function Doc:match(pat)
    local template, err = template_cache(pat)
    if not template then return nil, err end

    -- A pattern node of the form  {{ <elem/> }}  (text, element, text) marks
    -- a repeated element: drop the two text children and flag the element.
    local function mark_repeated(_, node)
        local before, middle, after = node[1], node[2], node[3]
        if is_text(before) and is_element(middle) and is_text(after)
            and before:find '%s*{{' and after:find '}}%s*' then
            t_remove(node, 1)
            -- after the first removal the closing text sits at index 2
            t_remove(node, 2)
            node[1].repeated = true
        end
    end
    _M.walk(template, false, mark_repeated)

    local captures = {}
    local matched = match(self, template, captures, true)
    return captures, matched
end
return _M