util/xml.lua
author Kim Alvefur <zash@zash.se>
Thu, 20 Jan 2022 09:57:20 +0100
branch 0.11
changeset 12205 e5e0ab93d7f4
parent 12185 783056b4e448
child 12206 ebeb4d959fb3
permissions -rw-r--r--
util.xml: Break reference to help the GC (fix #1711). LuaExpat uses a registry reference to track handlers, so an upvalue like this creates a reference loop that keeps the parser and its handlers from being garbage collected. The same issue has affected util.xmppstream in the past. Code for checking: local xml_parse = require"util.xml".parse; for i = 1, 10000 do xml_parse("<root/>") end collectgarbage(); collectgarbage(); print(collectgarbage("count"), "KiB"); A future release of LuaExpat may fix the underlying issue there.


local st = require "util.stanza";
local lxp = require "lxp";
local t_insert = table.insert;
local t_remove = table.remove;
local error = error;

-- Shadow _ENV with nil: in Lua 5.2+ any global access below this line fails
-- at runtime, so every name the module uses must be captured in a local above.
local _ENV = nil;
-- luacheck: std none

-- parse_xml(xml, options)
--   xml:     document text, handed to the lxp parser in a single parse() call.
--   options: optional table; when options.allow_comments is truthy, XML
--            comments do not abort parsing.
-- Returns the first child tag of an internal "root" stanza on success.
-- On failure returns the parser's falsy result followed by a message of the
-- form "<error> (line <n>, col <n>)".
local parse_xml = (function()
	-- Namespace URIs whose attributes are rewritten to a "prefix:name" key.
	local ns_prefixes = {
		["http://www.w3.org/XML/1998/namespace"] = "xml";
	};
	-- Separator passed to lxp.new; ns_pattern splits names on it into a
	-- namespace part and a local-name part.
	local ns_separator = "\1";
	local ns_pattern = "^([^"..ns_separator.."]*)"..ns_separator.."?(.*)$";

	-- Shared callback for constructs that must stop the parse.
	-- It works on its first argument (presumably the parser object lxp hands
	-- to callbacks) instead of closing over the parser: holding the parser
	-- as an upvalue would create a reference loop through LuaExpat's handler
	-- registry and keep both from being garbage collected.
	local function abort_parsing(parser)
		if not (parser.stop and parser:stop()) then
			error("Failed to abort parsing");
		end
	end

	return function(xml, options)
		local root = st.stanza("root");
		-- Parallel stacks of the prefixed namespace declarations in scope.
		local ns_urls = {};
		local ns_names = {};

		local handler = {
			StartNamespaceDecl = function(_, prefix, url)
				if prefix == nil then
					return;
				end
				t_insert(ns_urls, url);
				t_insert(ns_names, prefix);
			end;

			EndNamespaceDecl = function(_, prefix)
				if prefix == nil then
					return;
				end
				-- we depend on each StartNamespaceDecl having a paired EndNamespaceDecl
				t_remove(ns_urls);
				t_remove(ns_names);
			end;

			StartElement = function(_, tagname, attr)
				local elem_ns, elem_name = tagname:match(ns_pattern);
				if elem_name == "" then
					-- No separator in the tag name: the first capture is the name.
					elem_name = elem_ns;
				elseif elem_ns ~= "" then
					attr.xmlns = elem_ns;
				end

				-- The array part of attr lists attribute names; drop it and
				-- rename keys whose namespace has a known fixed prefix.
				local attr_count = #attr;
				for idx = 1, attr_count do
					local key = attr[idx];
					attr[idx] = nil;
					local attr_ns, attr_name = key:match(ns_pattern);
					if attr_name ~= "" then
						local fixed_prefix = ns_prefixes[attr_ns];
						if fixed_prefix then
							attr[fixed_prefix..":"..attr_name] = attr[key];
							attr[key] = nil;
						end
					end
				end

				-- Snapshot of prefix -> URI; later declarations override earlier ones.
				local in_scope = {};
				for idx = 1, #ns_urls do
					in_scope[ns_names[idx]] = ns_urls[idx];
				end
				root:tag(elem_name, attr, in_scope);
			end;

			CharacterData = function(_, data)
				root:text(data);
			end;

			EndElement = function()
				root:up();
			end;

			-- SECURITY: These two handlers, especially the Doctype one, are required to prevent exploits such as Billion Laughs.
			StartDoctypeDecl = abort_parsing;
			ProcessingInstruction = abort_parsing;
		};

		if not (options and options.allow_comments) then
			-- NOTE: comments are generally harmless and can be useful when parsing configuration files or other data, even user-provided data
			handler.Comment = abort_parsing;
		end

		local parser = lxp.new(handler, ns_separator);
		local ok, err, line, col = parser:parse(xml);
		if ok then
			-- A second parse() call without arguments follows a successful first one.
			ok, err, line, col = parser:parse();
		end
		--parser:close();
		if not ok then
			return ok, err.." (line "..line..", col "..col..")";
		end
		return root.tags[1];
	end;
end)();

-- Public interface of this module: a single `parse` entry point.
local exports = {};
exports.parse = parse_xml;
return exports;