util.format: Escape invalid UTF-8 by passing through serialization
author Kim Alvefur <zash@zash.se>
Fri, 10 Dec 2021 22:48:45 +0100
changeset 12035 87bc26f23d9b
parent 12034 9f8206e99b89
child 12036 3db09eb4c43b
util.format: Escape invalid UTF-8 by passing through serialization
Should prevent invalid UTF-8 from making it into the logs, which can cause trouble with terminals, log viewers, or other tools, such as when grep determines that log files are binary.
spec/util_format_spec.lua
util/format.lua
--- a/spec/util_format_spec.lua	Fri Dec 10 22:25:34 2021 +0100
+++ b/spec/util_format_spec.lua	Fri Dec 10 22:48:45 2021 +0100
@@ -20,5 +20,9 @@
 			assert.equal("␁", format("%s", "\1"));
 		end);
 
+		it("escapes invalid UTF-8", function ()
+			assert.equal("\"Hello w\\195rld\"", format("%s", "Hello w\195rld"));
+		end);
+
 	end);
 end);
--- a/util/format.lua	Fri Dec 10 22:25:34 2021 +0100
+++ b/util/format.lua	Fri Dec 10 22:48:45 2021 +0100
@@ -5,6 +5,7 @@
 local tostring = tostring;
 local unpack = table.unpack or unpack; -- luacheck: ignore 113/unpack
 local pack = require "util.table".pack; -- TODO table.pack in 5.2+
+local valid_utf8 = require "util.encodings".utf8.valid;
 local type = type;
 local dump = require "util.serialization".new("debug");
 local num_type = math.type or function (n)
@@ -60,10 +61,18 @@
 				args[i] = dump(arg);
 				spec = "%s";
 			elseif option == "s" then
-				args[i] = tostring(arg):gsub("[%z\1-\8\11-\31\127]", control_symbols):gsub("\n\t?", "\n\t");
+				arg = tostring(arg);
+				if arg:find("[\128-\255]") and not valid_utf8(arg) then
+					args[i] = dump(arg);
+				else
+					args[i] = arg:gsub("[%z\1-\8\11-\31\127]", control_symbols):gsub("\n\t?", "\n\t");
+				end
 			elseif type(arg) ~= "number" then -- arg isn't number as expected?
 				args[i] = tostring(arg);
 				spec = "[%s]";
+				option = "s";
+				spec = "[%s]";
+				t = "string";
 			elseif expects_integer[option] and num_type(arg) ~= "integer" then
 				args[i] = tostring(arg);
 				spec = "[%s]";