6 """API for simple access to babelfish.altavista.com. |
6 """API for simple access to babelfish.altavista.com. |
7 |
7 |
8 Summary: |
8 Summary: |
9 |
9 |
10 import babelizer |
10 import babelizer |
11 |
11 |
12 print ' '.join(babelizer.available_languages) |
12 print ' '.join(babelizer.available_languages) |
13 |
13 |
14 print babelizer.translate( 'How much is that doggie in the window?', |
14 print babelizer.translate( 'How much is that doggie in the window?', |
15 'English', 'French' ) |
15 'English', 'French' ) |
16 |
16 |
17 def babel_callback(phrase): |
17 def babel_callback(phrase): |
18 print phrase |
18 print phrase |
19 sys.stdout.flush() |
19 sys.stdout.flush() |
20 |
20 |
21 babelizer.babelize( 'I love a reigning knight.', |
21 babelizer.babelize( 'I love a reigning knight.', |
22 'English', 'German', |
22 'English', 'German', |
23 callback = babel_callback ) |
23 callback = babel_callback ) |
24 |
24 |
25 available_languages |
25 available_languages |
56 Version: $Id: babelizer.py,v 1.1.1.1 2005/09/29 21:38:49 mikem Exp $ |
56 Version: $Id: babelizer.py,v 1.1.1.1 2005/09/29 21:38:49 mikem Exp $ |
57 Author: Jonathan Feinberg <jdf@pobox.com> |
57 Author: Jonathan Feinberg <jdf@pobox.com> |
58 """ |
58 """ |
59 import re, string, urllib |
59 import re, string, urllib |
60 |
60 |
|
def unicode_urlencode(params):
    """URL-encode params, encoding unicode values as UTF-8 first.

    params may be a dict or a sequence of (key, value) pairs.  Every
    value that is a unicode object is converted to a UTF-8 byte string
    before being handed to urllib.urlencode; all other values are passed
    through untouched.  Keys are never modified.

    Returns the query string produced by urllib.urlencode.
    """
    if isinstance(params, dict):
        params = params.items()

    encoded = []
    for key, value in params:
        # Explicit branch instead of the "cond and a or b" trick, which
        # falls back to the unencoded value whenever the encoded result
        # is falsy (e.g. an empty string).
        if isinstance(value, unicode):
            value = value.encode('utf-8')
        encoded.append((key, value))
    return urllib.urlencode(encoded)
|
66 |
|
67 |
61 """ |
68 """ |
62 Various patterns I have encountered in looking for the babelfish result. |
69 Various patterns I have encountered in looking for the babelfish result. |
63 We try each of them in turn, based on the relative number of times I've |
70 We try each of them in turn, based on the relative number of times I've |
64 seen each of these patterns. $1.00 to anyone who can provide a heuristic |
71 seen each of these patterns. $1.00 to anyone who can provide a heuristic |
65 for knowing which one to use. This includes AltaVista employees. |
72 for knowing which one to use. This includes AltaVista employees. |
66 """ |
73 """ |
# Compiled result patterns, tried in list order.  Each one captures, as
# group 1, the text that follows its marker up to the next '<'.
__where = [
    # Text following name="q">
    re.compile(r'name=\"q\">([^<]*)'),
    # Text following td bgcolor=white>
    re.compile(r'td bgcolor=white>([^<]*)'),
    # Text following </strong><br>
    re.compile(r'<\/strong><br>([^<]*)'),
    # Text inside a <Div>/<div> whose style starts with padding:10px;
    re.compile(r'<[Dd]iv style=padding:10px;[^>]*>([^<]*)'),
]
72 |
79 |
73 __languages = { 'english' : 'en', |
80 __languages = { 'english' : 'en', |
74 'french' : 'fr', |
81 'french' : 'fr', |
75 'spanish' : 'es', |
82 'spanish' : 'es', |
111 pass |
118 pass |
112 |
119 |
def clean(text):
    """Return text with every run of whitespace collapsed to one space.

    Leading and trailing whitespace is dropped and newlines are treated
    like any other whitespace, so the result is a single line.
    """
    # split() with no arguments already discards leading/trailing
    # whitespace and splits on newlines, so no separate strip() or
    # newline replacement (via the deprecated string module) is needed.
    return ' '.join(text.split())
115 |
122 |
116 def translate(phrase, from_lang, to_lang): |
123 def translate(phrase, from_lang, to_lang, utf=1): |
117 phrase = clean(phrase) |
124 phrase = clean(phrase) |
118 try: |
125 try: |
119 from_code = __languages[from_lang.lower()] |
126 from_code = __languages[from_lang.lower()] |
120 to_code = __languages[to_lang.lower()] |
127 to_code = __languages[to_lang.lower()] |
121 except KeyError, lang: |
128 except KeyError, lang: |
122 raise LanguageNotAvailableError(lang) |
129 raise LanguageNotAvailableError(lang) |
123 |
130 |
124 params = urllib.urlencode( { 'doit' : 'done', |
131 if utf: |
|
132 phrase.encode('utf-8', 'replace') |
|
133 |
|
134 params = unicode_urlencode( { 'doit' : 'done', |
125 'tt' : 'urltext', |
135 'tt' : 'urltext', |
126 'intl' : '1', |
136 'intl' : '1', |
127 'urltext' : phrase.encode('utf-8', 'replace'), |
137 'urltext' : phrase, |
128 'lp' : from_code + '_' + to_code } ) |
138 'lp' : from_code + '_' + to_code } ) |
129 try: |
139 try: |
130 response = urllib.urlopen('http://babelfish.altavista.com/babelfish/tr', params) |
140 babel = urllib.FancyURLopener() |
|
141 babel.addheader('Accept-Charset', 'ISO-8859-1,utf-8;q=0.7,*;q=0.7') |
|
142 response = babel.open('http://babelfish.altavista.com/tr', params) |
131 except IOError, what: |
143 except IOError, what: |
132 raise BabelizerIOError("Couldn't talk to server: %s" % what) |
144 raise BabelizerIOError("Couldn't talk to server: %s" % what) |
133 except: |
145 except: |
134 print "Unexpected error:", sys.exc_info()[0] |
146 print "Unexpected error:", sys.exc_info()[0] |
135 |
147 |
140 if not match: raise BabelfishChangedError("Can't recognize translated string.") |
152 if not match: raise BabelfishChangedError("Can't recognize translated string.") |
141 return clean(match.group(1)) |
153 return clean(match.group(1)) |
142 |
154 |
143 def babelize(phrase, from_language, through_language, limit = 12, callback = None): |
155 def babelize(phrase, from_language, through_language, limit = 12, callback = None): |
144 phrase = clean(phrase) |
156 phrase = clean(phrase) |
|
157 phrase.encode('utf-8', 'replace') |
145 seen = { phrase: 1 } |
158 seen = { phrase: 1 } |
146 if callback: |
159 if callback: |
147 callback(phrase) |
160 callback(phrase) |
148 else: |
161 else: |
149 results = [ phrase ] |
162 results = [ phrase ] |
150 flip = { from_language: through_language, through_language: from_language } |
163 flip = { from_language: through_language, through_language: from_language } |
151 next = from_language |
164 next = from_language |
152 for i in range(limit): |
165 for i in range(limit): |
153 phrase = translate(phrase, next, flip[next]) |
166 phrase = translate(phrase, next, flip[next], None) |
154 if seen.has_key(phrase): break |
167 if seen.has_key(phrase): break |
155 seen[phrase] = 1 |
168 seen[phrase] = 1 |
156 if callback: |
169 if callback: |
157 callback(phrase) |
170 callback(phrase) |
158 else: |
171 else: |