--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/mod_export_skeletons/README.md Thu Dec 09 23:48:25 2021 +0100
@@ -0,0 +1,41 @@
+---
+summary: Export message archives in sanitized minimal form for analysis
+---
+
+Exports message archives in a format stripped of private information
+and message content.
+
+# Usage
+
+ prosodyctl mod_export_skeletons [options] user@host*
+
+Multiple user JIDs can be given.
+
+Some storage drivers such as [SQL][doc:modules:mod_storage_sql] allow
+exporting all users at once by giving the special username `*`, i.e.
+`prosodyctl mod_export_skeletons \*@example.com`.
+
+`--start=timestamp`
+: Start of time span to export in [XEP-0082] format
+
+`--end=timestamp`
+: End of time span to export in [XEP-0082] format
+
+# Output
+
+All content is stripped, leaving only the basic XML structure, with
+child tags sorted.
+
+Top level attributes are given special treatment since they carry
+protocol semantics. Notably the `@to` and `@from` JIDs are replaced by
+symbolic labels to convey what form (bare, full or host) they had. The
+`@id` attribute is replaced with a string of the same length.
+
+## Example
+
+```xml
+<message from='full' id='xxxxxxxxxxxxxxxx' type='chat' to='bare'><body/><x xmlns='jabber:x:oob'><url/></x></message>
+<message from='bare' id='xxxxxxxxxxxxxxxx' type='error' to='full'><error><remote-server-not-found xmlns='urn:ietf:params:xml:ns:xmpp-stanzas'/><text xmlns='urn:ietf:params:xml:ns:xmpp-stanzas'/></error></message>
+<message from='full' id='xxxxxxxxxxxxxxxx' type='chat' to='bare'><body/><x xmlns='jabber:x:oob'><url/></x></message>
+<message from='full' id='xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx' type='normal' to='bare'><x xmlns='jabber:x:conference'/></message>
+```
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/mod_export_skeletons/mod_export_skeletons.lua Thu Dec 09 23:48:25 2021 +0100
@@ -0,0 +1,71 @@
+
+local t_insert = table.insert;
+local t_sort = table.sort;
+
+local sm = require "core.storagemanager";
+local um = require "core.usermanager";
+
+local argparse = require "util.argparse";
+local dt = require "util.datetime";
+local jid = require "util.jid";
+local st = require "util.stanza";
+
+-- Copy stanza `s` keeping only element names and xmlns; child tags are sorted by xmlns, then name.
+local function skeleton(s)
+	local bare = st.stanza(s.name, { xmlns = s.attr.xmlns });
+	local kids = {};
+	for _, tag in ipairs(s.tags) do t_insert(kids, skeleton(tag)); end
+	t_sort(kids, function(x, y)
+		if x.attr.xmlns ~= y.attr.xmlns then return (x.attr.xmlns or "") < (y.attr.xmlns or ""); end
+		return x.name < y.name;
+	end);
+	for i = 1, #kids do bare:add_direct_child(kids[i]); end
+	return bare;
+end
+
+-- Reduce a JID to a label describing its form, so the address itself never reaches the output.
+-- Yields "" when no value is given and "invalid" when jid.split() returns no parts.
+local function classify_jid(s)
+	if not s then
+		return "";
+	end
+	local node, domain, resource = jid.split(s);
+	-- Most specific part wins: resource, then node, then host alone.
+	if resource then return "full"; end
+	if node then return "bare"; end
+	if domain then return "host"; end
+	return "invalid";
+end
+
+-- prosodyctl command: print a sanitized skeleton of every archived stanza for each JID in arg.
+-- Options: --store, --with, --start, --end. Returns 1 on unparsable options or a JID without a host.
+function module.command(arg)
+	local opts, err, where = argparse.parse(arg, { value_params = { store = true; with = true; start = true; ["end"] = true } });
+	if not opts then io.stderr:write(string.format("Invalid arguments: %s (%s)\n", tostring(err), tostring(where))); return 1; end
+	local store = opts.store or "archive"; -- so you can pass 'archive2'
+	opts.store = nil;
+	local query = { with = jid.prep(opts.with); start = dt.parse(opts.start); ["end"] = dt.parse(opts["end"]) };
+	local host_initialized = {};
+	for _, export_jid in ipairs(arg) do
+		local username, host = jid.split(export_jid);
+		if not host then io.stderr:write(string.format("Invalid JID: %s\n", tostring(export_jid))); return 1; end
+		if not host_initialized[host] then
+			sm.initialize_host(host);
+			um.initialize_host(host);
+			host_initialized[host] = true;
+		end
+		local archive = module:context(host):open_store(store, "archive");
+		local iter, total = assert(archive:find(username ~= "*" and username, query)); -- "*" is passed to find() as false
+		if total then io.stderr:write(string.format("Processing %d entries\n", total)); end
+		for _, item in iter do
+			local clean = skeleton(item);
+			-- Normalize top level attributes
+			clean.attr.type = item.attr.type;
+			if clean.attr.type == nil and clean.name == "message" then clean.attr.type = "normal"; end
+			clean.attr.id = string.rep("x", #(item.attr.id or "")); -- worth rounding to nearest power of two or so?
+			clean.attr.from = classify_jid(item.attr.from);
+			clean.attr.to = classify_jid(item.attr.to);
+			print(clean);
+		end
+	end
+end