|
1 """Python wrapper for Google web APIs |
|
2 |
|
3 This module allows you to access Google's web APIs through SOAP, |
|
4 to do things like search Google and get the results programmatically. |
|
5 Described here: |
|
6 http://www.google.com/apis/ |
|
7 |
|
8 You need a Google-provided license key to use these services. |
|
9 Follow the link above to get one. These functions will look in |
|
10 several places (in this order) for the license key: |
|
11 - the "license_key" argument of each function |
|
12 - the module-level LICENSE_KEY variable (call setLicense once to set it) |
|
13 - an environment variable called GOOGLE_LICENSE_KEY |
|
14 - a file called ".googlekey" in the current directory |
|
15 - a file called "googlekey.txt" in the current directory |
|
16 - a file called ".googlekey" in your home directory |
|
17 - a file called "googlekey.txt" in your home directory |
|
18 - a file called ".googlekey" in the same directory as google.py |
|
19 - a file called "googlekey.txt" in the same directory as google.py |
|
20 |
|
21 Sample usage: |
|
22 >>> import google |
|
23 >>> google.setLicense('...') # must get your own key! |
|
24 >>> data = google.doGoogleSearch('python') |
|
25 >>> data.meta.searchTime |
|
26 0.043221000000000002 |
|
27 >>> data.results[0].URL |
|
28 'http://www.python.org/' |
|
29 >>> data.results[0].title |
|
30 '<b>Python</b> Language Website' |
|
31 |
|
32 See documentation of SearchResultsMetaData and SearchResult classes |
|
33 for other available attributes. |
|
34 """ |
|
35 |
|
# Module metadata.  __cvsversion__ and __date__ are sliced out of
# "$Keyword: value $" strings: the slice drops the leading "$Keyword: "
# and the trailing " $", leaving only the value.
__author__ = "Mark Pilgrim (f8dy@diveintomark.org)"
__version__ = "0.5.2"
__cvsversion__ = "$Revision: 1.1.1.1 $"[11:-2]
__date__ = "$Date: 2005/09/29 21:38:49 $"[7:-2]
__copyright__ = "Copyright (c) 2002 Mark Pilgrim"
__license__ = "Python"
# One contributor per line; version() prints this text verbatim.
__credits__ = "\n".join([
    "David Ascher, for the install script",
    "Erik Max Francis, for the command line interface",
    "Michael Twomey, for HTTP proxy support",
])
|
45 |
|
46 import SOAP |
|
47 import os, sys, getopt |
|
48 |
|
# Module-wide defaults; change them with setLicense() and setProxy().
# None means "not set": the license key is then searched for in the other
# documented locations, and no module-level proxy is used.
LICENSE_KEY = None
HTTP_PROXY = None
|
51 |
|
52 # don't touch the rest of these constants |
|
class NoLicenseKey(Exception):
    """raised when no Google license key is found in any known location"""
|
# SOAP endpoint and namespace passed to SOAP.SOAPProxy by the main functions.
_url = 'http://api.google.com/search/beta2'
_namespace = 'urn:GoogleSearch'
# SOAP boolean values, built once and handed out by _marshalBoolean().
_false, _true = SOAP.booleanType(0), SOAP.booleanType(1)
# File names that are searched for a stored license key.
_googlefile1, _googlefile2 = ".googlekey", "googlekey.txt"
|
# Ordered search path for the license key.  Each entry is a pair
# (getter, description):
#   getter      - called with the license_key argument of the calling
#                 function; returns the key, or a false value if this
#                 location has none
#   description - human-readable name of the location
# The getters are lambdas so that the environment and the file system are
# only consulted when a key is actually requested.
# The home directory is resolved with os.path.expanduser('~') instead of
# $HOME alone: with HOME unset the old lookup produced an empty directory
# name, so the "home directory" entries silently re-read the current
# directory and a key stored in the real home directory was never found.
_licenseLocations = (
    (lambda key: key, 'passed to the function in license_key variable'),
    (lambda key: LICENSE_KEY, 'module-level LICENSE_KEY variable (call setLicense to set it)'),
    (lambda key: os.environ.get('GOOGLE_LICENSE_KEY', None), 'an environment variable called GOOGLE_LICENSE_KEY'),
    (lambda key: _contentsOf(os.getcwd(), _googlefile1), '%s in the current directory' % _googlefile1),
    (lambda key: _contentsOf(os.getcwd(), _googlefile2), '%s in the current directory' % _googlefile2),
    (lambda key: _contentsOf(os.path.expanduser('~'), _googlefile1), '%s in your home directory' % _googlefile1),
    (lambda key: _contentsOf(os.path.expanduser('~'), _googlefile2), '%s in your home directory' % _googlefile2),
    (lambda key: _contentsOf(_getScriptDir(), _googlefile1), '%s in the google.py directory' % _googlefile1),
    (lambda key: _contentsOf(_getScriptDir(), _googlefile2), '%s in the google.py directory' % _googlefile2)
    )
|
71 |
|
72 ## administrative functions |
|
def version():
    """print program name, version, copyright, release date and credits

    The values come from the module-level __version__, __copyright__,
    __date__ and __credits__ variables.
    """
    # A single parenthesized argument is valid both as a Python 2 print
    # statement and as a Python 3 print() call, and prints the same text.
    print("""PyGoogle %(__version__)s
%(__copyright__)s
released %(__date__)s

Thanks to:
%(__credits__)s""" % globals())
|
80 |
|
def usage():
    """print command-line help, including where the license key is looked up

    The program name shown is the base name of sys.argv[0].
    """
    program = os.path.basename(sys.argv[0])
    # Single-argument print(...) behaves identically as a Python 2 print
    # statement and as a Python 3 function call.
    print("""Usage: %(program)s [options] [querytype] query

options:
-k, --key= <license key> Google license key (see important note below)
-1, -l, --lucky show only first hit
-m, --meta show meta information
-r, --reverse show results in reverse order
-x, --proxy= <url> use HTTP proxy
-h, --help print this help
-v, --version print version and copyright information
-t, --test run test queries

querytype:
-s, --search= <query> search (default)
-c, --cache= <url> retrieve cached page
-p, --spelling= <word> check spelling

IMPORTANT NOTE: all Google functions require a valid license key;
visit http://www.google.com/apis/ to get one. %(program)s will look in
these places (in order) and use the first license key it finds:
* the key specified on the command line""" % vars())
    # The first two locations (function argument and module variable) are
    # skipped: the "command line" bullet above stands in for them.
    for get, location in _licenseLocations[2:]:
        print(" * %s" % (location,))
|
106 |
|
107 ## utility functions |
|
def setLicense(license_key):
    """store license_key in the module-level LICENSE_KEY variable"""
    globals()["LICENSE_KEY"] = license_key
|
112 |
|
def getLicense(license_key = None):
    """get license key

    The getters listed in _licenseLocations are tried in order, each one
    being called with license_key, and the first true value is returned;
    see module docs for the search order.

    Parameters:
    license_key (optional) - key supplied by the caller

    Returns: the license key

    Raises: NoLicenseKey (after printing the usage text) when no location
    yields a key
    """
    for get, location in _licenseLocations:
        found = get(license_key)
        if found:
            return found
    usage()
    # Call form of raise: the old "raise Class, message" statement is a
    # syntax error on Python 3, while this spelling works on Python 2 and 3.
    raise NoLicenseKey('get a license key at http://www.google.com/apis/')
|
123 |
|
def setProxy(http_proxy):
    """store http_proxy in the module-level HTTP_PROXY variable"""
    globals()["HTTP_PROXY"] = http_proxy
|
128 |
|
def getProxy(http_proxy = None):
    """return http_proxy when it is a true value, else the module-level HTTP_PROXY"""
    if http_proxy:
        return http_proxy
    return HTTP_PROXY
|
132 |
|
def _contentsOf(dirname, filename):
    """return the contents of dirname/filename, or None if it does not exist

    Parameters:
    dirname - directory to look in
    filename - name of the file inside dirname

    The text is returned exactly as read; nothing is stripped.  Errors
    raised while opening or reading an existing path are not caught.
    """
    filename = os.path.join(dirname, filename)
    if not os.path.exists(filename):
        return None
    fsock = open(filename)
    try:
        # try/finally closes the handle even when read() raises, so a
        # failed read no longer leaks the open file.
        return fsock.read()
    finally:
        fsock.close()
|
140 |
|
def _getScriptDir():
    """return the absolute directory holding this script or module

    When run as a script the path comes from sys.argv[0]; when imported it
    comes from the __file__ of this module's entry in sys.modules.
    """
    if __name__ != '__main__':
        location = sys.modules[__name__].__file__
    else:
        location = sys.argv[0]
    return os.path.abspath(os.path.dirname(location))
|
146 |
|
def _marshalBoolean(value):
    """map any Python truth value onto the module's SOAP booleans _true / _false"""
    if not value:
        return _false
    return _true
|
152 |
|
153 ## output formatters |
|
def makeFormatter(outputFormat):
    """create the formatter object for outputFormat

    The class is looked up by name in the module globals as
    "<Format>OutputFormatter", where <Format> is outputFormat.capitalize()
    (for example "text" gives TextOutputFormatter), and instantiated
    without arguments.  A KeyError is raised when no such name exists.
    """
    formatterClass = globals()[outputFormat.capitalize() + "OutputFormatter"]
    return formatterClass()
|
157 |
|
def output(results, params):
    """hand results to the formatter method selected by params

    Parameters:
    results - value to be formatted
    params - dictionary; "func" (required) names the formatter method to
      call, "outputFormat" (optional, default "text") selects the formatter
      class via makeFormatter

    The formatter method is called with (results, params).
    """
    formatName = params.get("outputFormat", "text")
    handler = getattr(makeFormatter(formatName), params["func"])
    handler(results, params)
|
162 |
|
class OutputFormatter:
    """base class for output formatters"""

    def boil(self, data):
        """return data in printable form

        Unicode strings are encoded as ISO-8859-1, with characters that
        cannot be encoded replaced; any other value is returned unchanged.
        """
        if type(data) != type(u""):
            return data
        return data.encode("ISO-8859-1", "replace")


class TextOutputFormatter(OutputFormatter):
    """formatter that prints results as plain "name: value" lines

    All printing uses single-argument print(...), which is valid and prints
    the same text both as a Python 2 statement and as a Python 3 call.
    """

    def common(self, data, params):
        """print the search metadata of data if params["showMeta"] is true

        Nothing is printed for a value without a "meta" attribute (the
        cached-page and spelling results are passed here as plain values);
        previously that case raised AttributeError.
        """
        if not params.get("showMeta", 0):
            return
        meta = getattr(data, "meta", None)
        if meta is None:
            return
        for category in meta.directoryCategories:
            print("directoryCategory: %s" % (self.boil(category["fullViewableName"]),))
        for attr in dir(meta):
            # directoryCategories was printed above; names starting with
            # two underscores are skipped
            if attr != "directoryCategories" and attr[:2] != '__':
                print("%s: %s" % (attr, self.boil(getattr(meta, attr))))

    def doGoogleSearch(self, data, params):
        """print every attribute of each search result, then the metadata

        Parameters:
        data - object with a "results" sequence (and "meta" for showMeta)
        params - dictionary of display options:
          feelingLucky - if true, show only the first result
          reverseOrder - if true, show the results in reverse order
          showMeta - if true, also print the metadata (see common)
        """
        # Work on a copy: reversing data.results itself would reorder the
        # caller's list as a side effect of printing it.
        results = list(data.results)
        if params.get("feelingLucky", 0):
            results = results[:1]
        if params.get("reverseOrder", 0):
            results.reverse()
        for result in results:
            for attr in dir(result):
                if attr == "directoryCategory":
                    print("directoryCategory: %s" % (self.boil(result.directoryCategory["fullViewableName"]),))
                elif attr[:2] != '__':
                    print("%s: %s" % (attr, self.boil(getattr(result, attr))))
            # blank line between results
            print("")
        self.common(data, params)

    def doGetCachedPage(self, data, params):
        """print data as is, then the metadata if requested and available"""
        print(data)
        self.common(data, params)

    # spelling suggestions are printed exactly like cached pages
    doSpellingSuggestion = doGetCachedPage
|
199 |
|
200 ## search results classes |
|
class _SearchBase:
    """base class that turns a dictionary of SOAP values into attributes

    Every key of params becomes an instance attribute (the key is passed
    through str()).  A SOAP.structType value is replaced by its _asdict
    dictionary; a value whose first element is a SOAP.structType is
    replaced by the list of its elements' _asdict dictionaries.  All other
    values are stored unchanged.
    """
    def __init__(self, params):
        for k, v in params.items():
            if isinstance(v, SOAP.structType):
                v = v._asdict
            try:
                if isinstance(v[0], SOAP.structType):
                    v = [node._asdict for node in v]
            except Exception:
                # Best effort: values that cannot be indexed with 0
                # (numbers, empty sequences, dictionaries without a 0 key)
                # are stored as they are.  "except Exception" is used
                # instead of a bare "except:" so that SystemExit and
                # KeyboardInterrupt are not swallowed (Python 2.5 and
                # later, where they no longer derive from Exception).
                pass
            self.__dict__[str(k)] = v
|
212 |
|
class SearchResultsMetaData(_SearchBase):
    """metadata describing the results of one search query

    Available attributes:
    documentFiltering - flag, indicates whether duplicate page filtering
      was performed in this search
    searchComments - human-readable informational message (example:
      "'the' is a very common word and was not included in your search")
    estimatedTotalResultsCount - estimated total number of results for
      this query
    estimateIsExact - flag, indicates whether estimatedTotalResultsCount
      is an exact value
    searchQuery - search string that initiated this search
    startIndex - index of first result returned (zero-based)
    endIndex - index of last result returned (zero-based)
    searchTips - human-readable informational message on how to use
      Google better
    directoryCategories - list of dictionaries like this:
      {'fullViewableName': Open Directory category,
       'specialEncoding': encoding scheme of this directory category}
    searchTime - total search time, in seconds
    """
|
232 |
|
class SearchResult(_SearchBase):
    """one search result

    Available attributes:
    URL - URL
    title - title (HTML)
    snippet - snippet showing query context (HTML)
    cachedSize - size of cached version of this result (KB)
    relatedInformationPresent - flag, indicates that the "related:"
      keyword is supported for this URL
    hostName - when filtering occurs, a maximum of two results from any
      given host is returned; when this occurs, the second resultElement
      that comes from that host contains the host name in this parameter
    directoryCategory - dictionary like this:
      {'fullViewableName': Open Directory category,
       'specialEncoding': encoding scheme of this directory category}
    directoryTitle - Open Directory title of this result (or blank)
    summary - Open Directory summary for this result (or blank)
    """
|
252 |
|
class SearchReturnValue:
    """everything returned for a single search query

    Available attributes:
    meta - the metadata object passed to the constructor
      (doGoogleSearch passes a SearchResultsMetaData)
    results - the results passed to the constructor
      (doGoogleSearch passes a list of SearchResult)
    """
    def __init__(self, metadata, results):
        self.results = results
        self.meta = metadata
|
263 |
|
264 ## main functions |
|
def doGoogleSearch(q, start=0, maxResults=10, filter=1, restrict='',
                   safeSearch=0, language='', inputencoding='', outputencoding='',
                   license_key = None, http_proxy = None):
    """search Google

    You need a license key to call this function; see
    http://www.google.com/apis/ to get one. Then you can either pass it to
    this function every time, or set it globally; see the module docs for details.

    Parameters:
    q - search string. Anything you could type at google.com, you can pass here.
      See http://www.google.com/help/features.html for examples of advanced features.
    start (optional) - zero-based index of first desired result (for paging through
      multiple pages of results)
    maxResults (optional) - maximum number of results, currently capped at 10
    filter (optional) - set to 1 to filter out similar results, set to 0 to see everything
    restrict (optional) - restrict results by country or topic. Examples:
      Ukraine - search only sites located in Ukraine
      linux - search Linux sites only
      mac - search Mac sites only
      bsd - search FreeBSD sites only
      See the APIs_reference.html file in the SDK (http://www.google.com/apis/download.html)
      for more advanced examples and a full list of country codes and topics.
    safeSearch (optional) - set to 1 to filter results with SafeSearch (no adult material)
    language (optional) - restricts search to documents in one or more languages. Example:
      lang_en - only return pages in English
      lang_fr - only return pages in French
      See the APIs_reference.html file in the SDK (http://www.google.com/apis/download.html)
      for more advanced examples and a full list of language codes.
    inputencoding (optional) - sets the character encoding of q parameter
    outputencoding (optional) - sets the character encoding of the returned results
      See the APIs_reference.html file in the SDK (http://www.google.com/apis/download.html)
      for a full list of encodings.
    license_key (optional) - Google license key
    http_proxy (optional) - address of HTTP proxy to use for sending and receiving SOAP messages

    Returns: SearchReturnValue
      .meta - SearchResultsMetaData
      .results - list of SearchResult
      See documentation of these individual classes for list of available attributes
    """
    http_proxy = getProxy(http_proxy)
    remoteserver = SOAP.SOAPProxy(_url, namespace=_namespace, http_proxy=http_proxy)
    license_key = getLicense(license_key)
    # the service expects SOAP booleans, not Python truth values
    # ("filter" shadows the builtin, but it is part of the public signature)
    filter = _marshalBoolean(filter)
    safeSearch = _marshalBoolean(safeSearch)
    data = remoteserver.doGoogleSearch(license_key, q, start, maxResults, filter, restrict,
                                       safeSearch, language, inputencoding, outputencoding)
    # The metadata is everything in the response except the result list.
    # It is built from a copy, so that the response's own _asdict is not
    # modified while data.resultElements is still to be read below.
    metadata = dict(data._asdict)
    del metadata["resultElements"]
    metadata = SearchResultsMetaData(metadata)
    results = [SearchResult(node._asdict) for node in data.resultElements]
    return SearchReturnValue(metadata, results)
|
317 |
|
def doGetCachedPage(url, license_key = None, http_proxy = None):
    """fetch a page from the Google cache

    A license key is required; get one at http://www.google.com/apis/ and
    either pass it on every call or set it globally (see the module docs).

    Parameters:
    url - address of page to get
    license_key (optional) - Google license key
    http_proxy (optional) - address of HTTP proxy to use for sending and receiving SOAP messages

    Returns: string, text of cached page
    """
    server = SOAP.SOAPProxy(_url, namespace=_namespace, http_proxy=getProxy(http_proxy))
    key = getLicense(license_key)
    return server.doGetCachedPage(key, url)
|
336 |
|
def doSpellingSuggestion(phrase, license_key = None, http_proxy = None):
    """ask Google for a spelling suggestion

    A license key is required; get one at http://www.google.com/apis/ and
    either pass it on every call or set it globally (see the module docs).

    Parameters:
    phrase - word or phrase to spell-check
    license_key (optional) - Google license key
    http_proxy (optional) - address of HTTP proxy to use for sending and receiving SOAP messages

    Returns: text of suggested replacement, or None
    """
    server = SOAP.SOAPProxy(_url, namespace=_namespace, http_proxy=getProxy(http_proxy))
    key = getLicense(license_key)
    return server.doSpellingSuggestion(key, phrase)
|
354 |
|
355 ## functional test suite (see googletest.py for unit test suite) |
|
def test():
    """run a few live queries against Google and print the results

    Returns silently when no license key can be found (getLicense itself
    prints the usage text before raising NoLicenseKey).
    """
    try:
        getLicense(None)
    except NoLicenseKey:
        return
    # Single-argument print(...) is valid and prints the same text both as
    # a Python 2 statement and as a Python 3 function call.
    print("Searching for Python at google.com...")
    data = doGoogleSearch("Python")
    output(data, {"func": "doGoogleSearch"})

    print("\nSearching for 5 _French_ pages about Python, encoded in ISO-8859-1...")
    data = doGoogleSearch("Python", language='lang_fr', outputencoding='ISO-8859-1', maxResults=5)
    output(data, {"func": "doGoogleSearch"})

    # deliberately misspelled, to provoke a suggestion
    phrase = "Pyhton programming languager"
    print("\nTesting spelling suggetions for '%s'..." % phrase)
    data = doSpellingSuggestion(phrase)
    output(data, {"func": "doSpellingSuggestion"})
|
373 |
|
374 ## main driver for command-line use |
|
def main(argv):
    """command-line driver

    Parameters:
    argv - list of command-line arguments, without the program name

    With no arguments the usage text is printed.  An unknown option prints
    the usage text and exits with status 2.  Otherwise the options are
    applied in order, the test queries are run if requested, and the
    selected query (a plain search of the first positional argument when
    no query option was given) is executed and its result printed.
    """
    if not argv:
        usage()
        return
    # NOTE: the local names below are part of the behavior: locals() is
    # passed to output() as the params dictionary at the end.
    q = func = http_proxy = license_key = None
    feelingLucky = showMeta = reverseOrder = runTest = 0
    outputFormat = "text"
    try:
        opts, args = getopt.getopt(
            argv, "s:c:p:k:lmrx:hvt1",
            ["search=", "cache=", "spelling=", "key=", "lucky", "meta",
             "reverse", "proxy=", "help", "version", "test"])
    except getopt.GetoptError:
        usage()
        sys.exit(2)
    for opt, arg in opts:
        if opt in ("-h", "--help"):
            usage()
        elif opt in ("-v", "--version"):
            version()
        elif opt in ("-t", "--test"):
            runTest = 1
        elif opt in ("-k", "--key"):
            license_key = arg
        elif opt in ("-x", "--proxy"):
            http_proxy = arg
        elif opt in ("-l", "-1", "--lucky"):
            feelingLucky = 1
        elif opt in ("-m", "--meta"):
            showMeta = 1
        elif opt in ("-r", "--reverse"):
            reverseOrder = 1
        # query types: when several are given, the last one wins
        elif opt in ("-s", "--search"):
            q, func = arg, "doGoogleSearch"
        elif opt in ("-c", "--cache"):
            q, func = arg, "doGetCachedPage"
        elif opt in ("-p", "--spelling"):
            q, func = arg, "doSpellingSuggestion"
    if runTest:
        # the test queries read the key and proxy from the module-level settings
        setLicense(license_key)
        setProxy(http_proxy)
        test()
    if not q and args:
        # no query option given: search for the first positional argument
        q, func = args[0], "doGoogleSearch"
    if func:
        # func names one of the module-level query functions
        results = globals()[func](q, http_proxy=http_proxy, license_key=license_key)
        output(results, locals())
|
430 |
|
# Run the command-line driver only when executed as a script, passing the
# arguments without the program name.
if __name__ == "__main__":
    main(sys.argv[1:])