modules/babelizer.py
changeset 29 602b355c5e89
parent 18 3a35dd9adc73
equal deleted inserted replaced
28:f7d12d2d39bc 29:602b355c5e89
     6 """API for simple access to babelfish.altavista.com.
     6 """API for simple access to babelfish.altavista.com.
     7 
     7 
     8 Summary:
     8 Summary:
     9 
     9 
    10     import babelizer
    10     import babelizer
    11    
    11 
    12 	print ' '.join(babelizer.available_languages)
    12 	print ' '.join(babelizer.available_languages)
    13 
    13 
    14     print babelizer.translate( 'How much is that doggie in the window?',
    14     print babelizer.translate( 'How much is that doggie in the window?',
    15 		                       'English', 'French' )
    15 		                       'English', 'French' )
    16 
    16 
    17     def babel_callback(phrase):
    17     def babel_callback(phrase):
    18 		print phrase
    18 		print phrase
    19 		sys.stdout.flush()
    19 		sys.stdout.flush()
    20 		
    20 
    21 	babelizer.babelize( 'I love a reigning knight.',
    21 	babelizer.babelize( 'I love a reigning knight.',
    22 						'English', 'German',
    22 						'English', 'German',
    23 						callback = babel_callback )
    23 						callback = babel_callback )
    24 
    24 
    25 available_languages
    25 available_languages
    56 Version: $Id: babelizer.py,v 1.1.1.1 2005/09/29 21:38:49 mikem Exp $
    56 Version: $Id: babelizer.py,v 1.1.1.1 2005/09/29 21:38:49 mikem Exp $
    57 Author: Jonathan Feinberg <jdf@pobox.com>
    57 Author: Jonathan Feinberg <jdf@pobox.com>
    58 """
    58 """
    59 import re, string, urllib
    59 import re, string, urllib
    60 
    60 
       
    61 def unicode_urlencode(params):
       
    62     if isinstance(params, dict):
       
    63         params = params.items()
       
    64         return urllib.urlencode([(k, isinstance(v, unicode) and v.encode('utf-8') or v)
       
    65                                  for k, v in params])
       
    66 
       
    67 
    61 """
    68 """
    62 Various patterns I have encountered in looking for the babelfish result.
    69 Various patterns I have encountered in looking for the babelfish result.
    63 We try each of them in turn, based on the relative number of times I've
    70 We try each of them in turn, based on the relative number of times I've
    64 seen each of these patterns.  $1.00 to anyone who can provide a heuristic
    71 seen each of these patterns.  $1.00 to anyone who can provide a heuristic
    65 for knowing which one to use.   This includes AltaVista employees.
    72 for knowing which one to use.   This includes AltaVista employees.
    66 """
    73 """
    67 __where = [ re.compile(r'name=\"q\">([^<]*)'), 
    74 __where = [ re.compile(r'name=\"q\">([^<]*)'),
    68             re.compile(r'td bgcolor=white>([^<]*)'),
    75             re.compile(r'td bgcolor=white>([^<]*)'),
    69             re.compile(r'<\/strong><br>([^<]*)'),
    76             re.compile(r'<\/strong><br>([^<]*)'),
    70             re.compile(r'<Div style=padding:10px;[^>]*>([^<]*)')
    77             re.compile(r'<[Dd]iv style=padding:10px;[^>]*>([^<]*)')
    71 ]
    78 ]
    72 
    79 
    73 __languages = { 'english'    : 'en',
    80 __languages = { 'english'    : 'en',
    74                 'french'     : 'fr',
    81                 'french'     : 'fr',
    75                 'spanish'    : 'es',
    82                 'spanish'    : 'es',
   111     pass
   118     pass
   112 
   119 
   113 def clean(text):
   120 def clean(text):
   114     return ' '.join(string.replace(text.strip(), "\n", ' ').split())
   121     return ' '.join(string.replace(text.strip(), "\n", ' ').split())
   115 
   122 
   116 def translate(phrase, from_lang, to_lang):
   123 def translate(phrase, from_lang, to_lang, utf=1):
   117     phrase = clean(phrase)
   124     phrase = clean(phrase)
   118     try:
   125     try:
   119         from_code = __languages[from_lang.lower()]
   126         from_code = __languages[from_lang.lower()]
   120         to_code = __languages[to_lang.lower()]
   127         to_code = __languages[to_lang.lower()]
   121     except KeyError, lang:
   128     except KeyError, lang:
   122         raise LanguageNotAvailableError(lang)
   129         raise LanguageNotAvailableError(lang)
   123 
   130 
   124     params = urllib.urlencode( { 'doit' : 'done',
   131     if utf:
       
   132         phrase.encode('utf-8', 'replace')
       
   133 
       
   134     params = unicode_urlencode( { 'doit' : 'done',
   125                                  'tt' : 'urltext',
   135                                  'tt' : 'urltext',
   126                                  'intl' : '1',
   136                                  'intl' : '1',
   127                                  'urltext' : phrase.encode('utf-8', 'replace'),
   137                                  'urltext' : phrase,
   128                                  'lp' : from_code + '_' + to_code } )
   138                                  'lp' : from_code + '_' + to_code } )
   129     try:
   139     try:
   130         response = urllib.urlopen('http://babelfish.altavista.com/babelfish/tr', params)
   140         babel = urllib.FancyURLopener()
       
   141         babel.addheader('Accept-Charset', 'ISO-8859-1,utf-8;q=0.7,*;q=0.7')
       
   142         response = babel.open('http://babelfish.altavista.com/tr', params)
   131     except IOError, what:
   143     except IOError, what:
   132         raise BabelizerIOError("Couldn't talk to server: %s" % what)
   144         raise BabelizerIOError("Couldn't talk to server: %s" % what)
   133     except:
   145     except:
   134         print "Unexpected error:", sys.exc_info()[0]
   146         print "Unexpected error:", sys.exc_info()[0]
   135 
   147 
   140     if not match: raise BabelfishChangedError("Can't recognize translated string.")
   152     if not match: raise BabelfishChangedError("Can't recognize translated string.")
   141     return clean(match.group(1))
   153     return clean(match.group(1))
   142 
   154 
   143 def babelize(phrase, from_language, through_language, limit = 12, callback = None):
   155 def babelize(phrase, from_language, through_language, limit = 12, callback = None):
   144     phrase = clean(phrase)
   156     phrase = clean(phrase)
       
   157     phrase.encode('utf-8', 'replace')
   145     seen = { phrase: 1 }
   158     seen = { phrase: 1 }
   146     if callback:
   159     if callback:
   147         callback(phrase)
   160         callback(phrase)
   148     else:
   161     else:
   149         results = [ phrase ]
   162         results = [ phrase ]
   150     flip = { from_language: through_language, through_language: from_language }
   163     flip = { from_language: through_language, through_language: from_language }
   151     next = from_language
   164     next = from_language
   152     for i in range(limit):
   165     for i in range(limit):
   153         phrase = translate(phrase, next, flip[next])
   166         phrase = translate(phrase, next, flip[next], None)
   154         if seen.has_key(phrase): break
   167         if seen.has_key(phrase): break
   155         seen[phrase] = 1
   168         seen[phrase] = 1
   156         if callback:
   169         if callback:
   157             callback(phrase)
   170             callback(phrase)
   158         else:
   171         else: