Babel: Try to fix UTF issues
author Mikael Berthe <mikael@lilotux.net>
Wed, 02 May 2007 23:53:08 +0200
changeset 29 602b355c5e89
parent 28 f7d12d2d39bc
child 30 33af762962f1
Babel: Try to fix UTF-8 encoding issues. This patch [1] helped me a lot. [1] http://ejabberd.jabber.ru/node/1185
modules/babelizer.py
plugins/babel_plugin.py
--- a/modules/babelizer.py	Wed May 02 22:38:53 2007 +0200
+++ b/modules/babelizer.py	Wed May 02 23:53:08 2007 +0200
@@ -8,7 +8,7 @@
 Summary:
 
     import babelizer
-   
+
 	print ' '.join(babelizer.available_languages)
 
     print babelizer.translate( 'How much is that doggie in the window?',
@@ -17,7 +17,7 @@
     def babel_callback(phrase):
 		print phrase
 		sys.stdout.flush()
-		
+
 	babelizer.babelize( 'I love a reigning knight.',
 						'English', 'German',
 						callback = babel_callback )
@@ -58,16 +58,23 @@
 """
 import re, string, urllib
 
+def unicode_urlencode(params):  # urllib.urlencode() wrapper: UTF-8-encodes unicode values first
+    if isinstance(params, dict):  # a dict is normalised to a list of (key, value) pairs
+        params = params.items()
+    return urllib.urlencode([(k, isinstance(v, unicode) and v.encode('utf-8') or v)
+                             for k, v in params])  # returned for dicts and pair sequences alike
+
+
 """
 Various patterns I have encountered in looking for the babelfish result.
 We try each of them in turn, based on the relative number of times I've
 seen each of these patterns.  $1.00 to anyone who can provide a heuristic
 for knowing which one to use.   This includes AltaVista employees.
 """
-__where = [ re.compile(r'name=\"q\">([^<]*)'), 
+__where = [ re.compile(r'name=\"q\">([^<]*)'),
             re.compile(r'td bgcolor=white>([^<]*)'),
             re.compile(r'<\/strong><br>([^<]*)'),
-            re.compile(r'<Div style=padding:10px;[^>]*>([^<]*)')
+            re.compile(r'<[Dd]iv style=padding:10px;[^>]*>([^<]*)')
 ]
 
 __languages = { 'english'    : 'en',
@@ -113,7 +120,7 @@
 def clean(text):
     return ' '.join(string.replace(text.strip(), "\n", ' ').split())
 
-def translate(phrase, from_lang, to_lang):
+def translate(phrase, from_lang, to_lang, utf=1):
     phrase = clean(phrase)
     try:
         from_code = __languages[from_lang.lower()]
@@ -121,13 +128,18 @@
     except KeyError, lang:
         raise LanguageNotAvailableError(lang)
 
-    params = urllib.urlencode( { 'doit' : 'done',
+    if utf:
+        phrase.encode('utf-8', 'replace')
+
+    params = unicode_urlencode( { 'doit' : 'done',
                                  'tt' : 'urltext',
                                  'intl' : '1',
-                                 'urltext' : phrase.encode('utf-8', 'replace'),
+                                 'urltext' : phrase,
                                  'lp' : from_code + '_' + to_code } )
     try:
-        response = urllib.urlopen('http://babelfish.altavista.com/babelfish/tr', params)
+        babel = urllib.FancyURLopener()
+        babel.addheader('Accept-Charset', 'ISO-8859-1,utf-8;q=0.7,*;q=0.7')
+        response = babel.open('http://babelfish.altavista.com/tr', params)
     except IOError, what:
         raise BabelizerIOError("Couldn't talk to server: %s" % what)
     except:
@@ -142,6 +154,7 @@
 
 def babelize(phrase, from_language, through_language, limit = 12, callback = None):
     phrase = clean(phrase)
+    phrase.encode('utf-8', 'replace')
     seen = { phrase: 1 }
     if callback:
         callback(phrase)
@@ -150,7 +163,7 @@
     flip = { from_language: through_language, through_language: from_language }
     next = from_language
     for i in range(limit):
-        phrase = translate(phrase, next, flip[next])
+        phrase = translate(phrase, next, flip[next], None)
         if seen.has_key(phrase): break
         seen[phrase] = 1
         if callback:
--- a/plugins/babel_plugin.py	Wed May 02 22:38:53 2007 +0200
+++ b/plugins/babel_plugin.py	Wed May 02 23:53:08 2007 +0200
@@ -58,7 +58,7 @@
 			results = babelizer.babelize(body, from_lang, through_lang)
 			reply = ''
 			for result in results:
-				reply += '\n' + result
+				reply += '\n' + result.decode('utf-8', 'replace')
 		except babelizer.LanguageNotAvailableError:
 			reply = 'Invalid Language'
 		except babelizer.BabelfishChangedError: