util-src/encodings.c
changeset 6607 478308ee29dd
parent 6416 a552f4170aed
parent 6595 141afe8a167b
child 6608 03a43bf3ecd2
--- a/util-src/encodings.c	Wed Jan 21 02:55:27 2015 +0100
+++ b/util-src/encodings.c	Fri Mar 27 00:27:29 2015 +0100
@@ -1,6 +1,7 @@
 /* Prosody IM
 -- Copyright (C) 2008-2010 Matthew Wild
 -- Copyright (C) 2008-2010 Waqas Hussain
+-- Copyright (C) 1994-2015 Lua.org, PUC-Rio.
 -- 
 -- This project is MIT/X11 licensed. Please see the
 -- COPYING file in the source package for more information.
@@ -120,6 +121,88 @@
 	{ NULL,		NULL	}
 };
 
+/******************* UTF-8 ********************/
+
+/*
+ * Adapted from Lua 5.3
+ * Needed because libidn does not validate that input is valid UTF-8
+ */
+
+#define MAXUNICODE	0x10FFFF
+
+/*
+ * Decode one UTF-8 sequence at o (the code point is stored in *val if val is non-NULL).
+ * Returns a pointer past it, or NULL if malformed, overlong, a surrogate or > MAXUNICODE.
+ */
+static const char *utf8_decode (const char *o, int *val) {
+	/* values <= limits[n] are overlong for n continuation bytes (n == 0: stray continuation byte) */
+	static const unsigned int limits[] = {0xFF, 0x7F, 0x7FF, 0xFFFF};
+	const unsigned char *s = (const unsigned char *)o;
+	unsigned int c = s[0];
+	unsigned int res = (c < 0x80) ? c : 0;  /* ascii decodes to itself */
+	if (c >= 0x80) {  /* multi-byte sequence (or invalid lead byte) */
+		int count = 0;  /* to count number of continuation bytes */
+		while (c & 0x40) {  /* still have continuation bytes? */
+			int cc = s[++count];  /* read next byte */
+			if (count > 3 || (cc & 0xC0) != 0x80)  /* too long (keeps the shift below in range) or not a continuation byte */
+				return NULL;  /* invalid byte sequence */
+			res = (res << 6) | (cc & 0x3F);  /* add lower 6 bits from cont. byte */
+			c <<= 1;  /* to test next bit */
+		}
+		res |= ((c & 0x7F) << (count * 5));  /* add first byte; count <= 3 here */
+		if (res > MAXUNICODE || res <= limits[count] || (0xd800 <= res && res <= 0xdfff))
+			return NULL;  /* out of range, overlong, or UTF-16 surrogate */
+		s += count;  /* skip continuation bytes read */
+	}
+	if (val) *val = res;
+	return (const char *)s + 1;  /* +1 to include first byte */
+}
+
+/*
+ * Check that the Lua string argument at stack index idx is valid UTF-8. Returns the string
+ * (storing its byte length in *l when l is non-NULL), or NULL if it is not valid UTF-8.
+ */
+const char* check_utf8 (lua_State *L, int idx, size_t *l) {
+	size_t pos = 0, len;
+	const char *s = luaL_checklstring(L, idx, &len);  /* use the caller's index, not a fixed 1 */
+	/* stop at len: a sequence cut off at the end is still rejected, on reading the trailing '\0' */
+	while (pos < len) {
+		const char *s1 = utf8_decode(s + pos, NULL);
+		if (s1 == NULL) {  /* conversion error? */
+			return NULL;
+		}
+		pos = (size_t)(s1 - s);  /* offset of the next sequence */
+	}
+	if(l != NULL) {
+		*l = len;
+	}
+	return s;
+}
+
+static int Lutf8_valid(lua_State *L) {		/** utf8.valid(s) */
+	lua_pushboolean(L, check_utf8(L, 1, NULL) != NULL);  /* check_utf8 returns NULL for invalid UTF-8 */
+	return 1;  /* one result: the boolean */
+}
+
+static int Lutf8_length(lua_State *L) {		/** utf8.length(s) */
+	size_t nbytes;  /* NOTE(review): check_utf8 stores the byte length here, not a code point count - confirm intended */
+	if(check_utf8(L, 1, &nbytes) != NULL) {  /* argument 1 is valid UTF-8 */
+		lua_pushinteger(L, nbytes);
+		return 1;
+	}
+	lua_pushnil(L);  /* invalid input: return nil plus an error message */
+	lua_pushliteral(L, "invalid utf8");
+	return 2;
+}
+
+static const luaL_Reg Reg_utf8[] =		/** functions registered in the module's "utf8" table */
+{
+	{ "valid",	Lutf8_valid	},	/* utf8.valid(s) */
+	{ "length",	Lutf8_length	},	/* utf8.length(s) */
+	{ NULL,		NULL	}	/* end-of-list sentinel */
+};
+
+
 /***************** STRINGPREP *****************/
 #ifdef USE_STRINGPREP_ICU
 
@@ -216,8 +299,8 @@
 		lua_pushnil(L);
 		return 1;
 	}
-	s = lua_tolstring(L, 1, &len);
-	if (len >= 1024) {
+	s = check_utf8(L, 1, &len);
+	if (s == NULL || len >= 1024 || len != strlen(s)) {
 		lua_pushnil(L);
 		return 1; /* TODO return error message */
 	}
@@ -324,7 +407,11 @@
 static int Lidna_to_ascii(lua_State *L)		/** idna.to_ascii(s) */
 {
 	size_t len;
-	const char *s = luaL_checklstring(L, 1, &len);
+	const char *s = check_utf8(L, 1, &len);
+	if (s == NULL || len != strlen(s)) {
+		lua_pushnil(L);
+		return 1; /* TODO return error message */
+	}
 	char* output = NULL;
 	int ret = idna_to_ascii_8z(s, &output, IDNA_USE_STD3_ASCII_RULES);
 	if (ret == IDNA_SUCCESS) {
@@ -384,6 +471,10 @@
 	luaL_register(L, NULL, Reg_idna);
 	lua_setfield(L, -2, "idna");
 
+	lua_newtable(L);
+	luaL_register(L, NULL, Reg_utf8);
+	lua_setfield(L, -2, "utf8");
+
 	lua_pushliteral(L, "-3.14");
 	lua_setfield(L, -2, "version");
 	return 1;