modules/babelizer.py
changeset 0 93b25987d3e5
child 18 3a35dd9adc73
equal deleted inserted replaced
-1:000000000000 0:93b25987d3e5
       
     1 # babelizer.py - API for simple access to babelfish.altavista.com.
       
     2 #                Requires python 2.0 or better.
       
     3 #
       
     4 # See it in use at http://babel.MrFeinberg.com/
       
     5 
       
     6 """API for simple access to babelfish.altavista.com.
       
     7 
       
     8 Summary:
       
     9 
       
    10     import babelizer
       
    11    
       
    12 	print ' '.join(babelizer.available_languages)
       
    13 
       
    14     print babelizer.translate( 'How much is that doggie in the window?',
       
    15 		                       'English', 'French' )
       
    16 
       
    17     def babel_callback(phrase):
       
    18 		print phrase
       
    19 		sys.stdout.flush()
       
    20 		
       
    21 	babelizer.babelize( 'I love a reigning knight.',
       
    22 						'English', 'German',
       
    23 						callback = babel_callback )
       
    24 
       
    25 available_languages
       
    26     A list of languages available for use with babelfish.
       
    27 
       
    28 translate( phrase, from_lang, to_lang )
       
    29     Uses babelfish to translate phrase from from_lang to to_lang.
       
    30 
       
    31 babelize(phrase, from_lang, through_lang, limit = 12, callback = None)
       
    32     Uses babelfish to translate back and forth between from_lang and
       
    33     through_lang until either no more changes occur in translation or
       
    34     limit iterations have been reached, whichever comes first.  Takes
       
    35     an optional callback function which should receive a single
       
    36     parameter, being the next translation.  Without the callback
       
    37     returns a list of successive translations.
       
    38 
       
    39 It's only guaranteed to work if 'english' is one of the two languages
       
    40 given to either of the translation methods.
       
    41 
       
    42 Both translation methods throw exceptions which are all subclasses of
       
    43 BabelizerError.  They include
       
    44 
       
    45 LanguageNotAvailableError
       
    46     Thrown on an attempt to use an unknown language.
       
    47 
       
    48 BabelfishChangedError
       
    49     Thrown when babelfish.altavista.com changes some detail of their
       
    50     layout, and babelizer can no longer parse the results or submit
       
    51     the correct form (a not infrequent occurrence).
       
    52 
       
    53 BabelizerIOError
       
    54     Thrown for various networking and IO errors.
       
    55 
       
    56 Version: $Id: babelizer.py,v 1.1.1.1 2005/09/29 21:38:49 mikem Exp $
       
    57 Author: Jonathan Feinberg <jdf@pobox.com>
       
    58 """
       
    59 import re, string, urllib
       
    60 
       
    61 """
       
    62 Various patterns I have encountered in looking for the babelfish result.
       
    63 We try each of them in turn, based on the relative number of times I've
       
    64 seen each of these patterns.  $1.00 to anyone who can provide a heuristic
       
    65 for knowing which one to use.   This includes AltaVista employees.
       
    66 """
       
# Compiled patterns that translate() tries, in list order, against the HTML
# returned by the server.  The first pattern that matches wins, and its
# group 1 (the text up to the next '<') is taken as the translation.
__where = [ re.compile(r'name=\"q\">([^<]*)'), 
            re.compile(r'td bgcolor=white>([^<]*)'),
            re.compile(r'<\/strong><br>([^<]*)'),
            re.compile(r'<Div style=padding:10px;[^>]*>([^<]*)')
]
       
    72 
       
# Maps every accepted language identifier to the two-letter code that
# translate() joins into the 'lp' request parameter.  translate() lower-cases
# its arguments before the lookup, so all keys are lower case.  Both the
# full English name and the bare code are keys, so callers may pass either.
__languages = { 'english'    : 'en',
                'french'     : 'fr',
                'spanish'    : 'es',
                'german'     : 'de',
                'italian'    : 'it',
                'portuguese' : 'pt',
                'russian'    : 'ru',
                'korean'     : 'ko',
                'chinese'    : 'zh',
                'japanese'   : 'ja',
                'en'         : 'en',
                'fr'         : 'fr',
                'es'         : 'es',
                'de'         : 'de',
                'it'         : 'it',
                'pt'         : 'pt',
                'ru'         : 'ru',
                'ko'         : 'ko',
                'zh'         : 'zh',
                'ja'         : 'ja',
           }
       
    94 
       
    95 """
       
    96   All of the available language names.
       
    97 """
       
# Title-cased copy of every key in __languages, so the list holds both the
# full names ('English') and the two-letter codes ('En').
available_languages = [ x.title() for x in __languages.keys() ]
       
    99 
       
   100 """
       
   101   Calling translate() or babelize() can raise a BabelizerError
       
   102 """
       
   103 class BabelizerError(Exception):
       
   104     pass
       
   105 
       
   106 class LanguageNotAvailableError(BabelizerError):
       
   107     pass
       
   108 class BabelfishChangedError(BabelizerError):
       
   109     pass
       
   110 class BabelizerIOError(BabelizerError):
       
   111     pass
       
   112 
       
   113 def clean(text):
       
   114     return ' '.join(string.replace(text.strip(), "\n", ' ').split())
       
   115 
       
   116 def translate(phrase, from_lang, to_lang):
       
   117     phrase = clean(phrase)
       
   118     try:
       
   119         from_code = __languages[from_lang.lower()]
       
   120         to_code = __languages[to_lang.lower()]
       
   121     except KeyError, lang:
       
   122         raise LanguageNotAvailableError(lang)
       
   123 
       
   124     params = urllib.urlencode( { 'doit' : 'done',
       
   125                                  'tt' : 'urltext',
       
   126                                  'intl' : '1',
       
   127                                  'urltext' : phrase.encode('utf-8', 'replace'),
       
   128                                  'lp' : from_code + '_' + to_code } )
       
   129     try:
       
   130         response = urllib.urlopen('http://babelfish.altavista.com/babelfish/tr', params)
       
   131     except IOError, what:
       
   132         raise BabelizerIOError("Couldn't talk to server: %s" % what)
       
   133     except:
       
   134         print "Unexpected error:", sys.exc_info()[0]
       
   135 
       
   136     html = response.read()
       
   137     for regex in __where:
       
   138         match = regex.search(html)
       
   139         if match: break
       
   140     if not match: raise BabelfishChangedError("Can't recognize translated string.")
       
   141     return clean(match.group(1))
       
   142 
       
   143 def babelize(phrase, from_language, through_language, limit = 12, callback = None):
       
   144     phrase = clean(phrase)
       
   145     seen = { phrase: 1 }
       
   146     if callback:
       
   147         callback(phrase)
       
   148     else:
       
   149         results = [ phrase ]
       
   150     flip = { from_language: through_language, through_language: from_language }
       
   151     next = from_language
       
   152     for i in range(limit):
       
   153         phrase = translate(phrase, next, flip[next])
       
   154         if seen.has_key(phrase): break
       
   155         seen[phrase] = 1
       
   156         if callback:
       
   157             callback(phrase)
       
   158         else:
       
   159             results.append(phrase)
       
   160         next = flip[next]
       
   161     if not callback: return results
       
   162 
       
   163 if __name__ == '__main__':
       
   164     import sys
       
   165     def printer(x):
       
   166         print x
       
   167         sys.stdout.flush();
       
   168 
       
   169     
       
   170     babelize("I won't take that sort of treatment from you, or from your doggie!",
       
   171              'english', 'french', callback = printer)