loudmouth/lm-parser.c
changeset 622 2fd864a27f9b
parent 607 e2bd14357c9a
child 623 d20813955632
--- a/loudmouth/lm-parser.c	Mon Oct 03 17:03:04 2011 +0200
+++ b/loudmouth/lm-parser.c	Mon Oct 03 17:06:18 2011 +0200
@@ -43,6 +43,8 @@
 
     GMarkupParser           *m_parser;
     GMarkupParseContext     *context;
+    gchar                   *incomplete; /* incomplete utf-8 character
+                                            found at the end of buffer */
 };
 
 
@@ -200,7 +202,7 @@
     g_return_if_fail (user_data != NULL);
     g_return_if_fail (error != NULL);
 
-    g_log (LM_LOG_DOMAIN, LM_LOG_LEVEL_PARSER,
+    g_log (LM_LOG_DOMAIN, LM_LOG_LEVEL_VERBOSE,
            "Parsing failed: %s\n", error->message);
 }
 
@@ -237,27 +239,98 @@
     parser->cur_root = NULL;
     parser->cur_node = NULL;
 
+    parser->incomplete = NULL;
+
     return parser;
 }
 
+static gchar *
+_lm_parser_make_valid (const gchar *buffer, gchar **incomplete)
+{
+    GString *string;
+    const gchar *remainder, *invalid;
+    gint remaining_bytes, valid_bytes;
+    gunichar code; /*error code for invalid character*/
+
+    g_return_val_if_fail (buffer != NULL, NULL);
+
+    string = NULL;
+    remainder = buffer;
+    remaining_bytes = strlen (buffer);
+
+    while (remaining_bytes != 0)
+    {
+        if (g_utf8_validate (remainder, remaining_bytes, &invalid))
+            break;
+        valid_bytes = invalid - remainder;
+
+        if (string == NULL)
+            string = g_string_sized_new (remaining_bytes);
+
+        g_string_append_len (string, remainder, valid_bytes);
+
+        remainder = g_utf8_find_next_char(invalid, NULL);
+        remaining_bytes -= valid_bytes + (remainder - invalid);
+
+        code = g_utf8_get_char_validated (invalid, -1);
+
+        if (code == -1) {
+            /* A complete but invalid codepoint */
+            /* append U+FFFD REPLACEMENT CHARACTER */
+            g_string_append (string, "\357\277\275");
+            g_log (LM_LOG_DOMAIN, LM_LOG_LEVEL_VERBOSE, "invalid character!\n");
+        } else if (code == -2) {
+            /* Beginning of what could be a character */
+            *incomplete = g_strdup (invalid);
+            g_log (LM_LOG_DOMAIN, LM_LOG_LEVEL_VERBOSE,
+                           "incomplete character: %s\n", *incomplete);
+
+            g_assert (remaining_bytes == 0);
+            g_assert (*(g_utf8_find_next_char(invalid, NULL)) == '\0');
+        }
+    }
+
+    if (string == NULL)
+        return g_strdup (buffer);
+
+    g_string_append (string, remainder);
+
+    g_assert (g_utf8_validate (string->str, -1, NULL));
+
+    return g_string_free (string, FALSE);
+}
+
+
 gboolean
 lm_parser_parse (LmParser *parser, const gchar *string)
 {
+    gboolean parsed;
+    gchar *valid, *completed;
     g_return_val_if_fail (parser != NULL, FALSE);
 
     if (!parser->context) {
         parser->context = g_markup_parse_context_new (parser->m_parser, 0,
                                                       parser, NULL);
     }
-
-    if (g_markup_parse_context_parse (parser->context, string,
-                                      (gssize)strlen (string), NULL)) {
-        return TRUE;
+    if (parser->incomplete) {
+        completed = g_strdup_printf("%s%s", parser->incomplete, string);
+        g_free(parser->incomplete);
+        parser->incomplete = NULL;
+    } else {
+        completed = g_strdup(string);
+    }
+    valid = _lm_parser_make_valid (completed, &parser->incomplete);
+    g_free(completed);
+    if (g_markup_parse_context_parse (parser->context, valid,
+                                      (gssize)strlen (valid), NULL)) {
+        parsed = TRUE;
     } else {
         g_markup_parse_context_free (parser->context);
         parser->context = NULL;
-        return FALSE;
+        parsed = FALSE;
     }
+    g_free(valid);
+    return parsed;
 }
 
 void
@@ -270,6 +343,7 @@
     if (parser->context) {
         g_markup_parse_context_free (parser->context);
     }
+    g_free (parser->incomplete);
     g_free (parser->m_parser);
     g_free (parser);
 }