modules/google.py
changeset 0 93b25987d3e5
child 18 3a35dd9adc73
equal deleted inserted replaced
-1:000000000000 0:93b25987d3e5
       
     1 """Python wrapper for Google web APIs
       
     2 
       
     3 This module allows you to access Google's web APIs through SOAP,
       
     4 to do things like search Google and get the results programmatically.
       
     5 Described here:
       
     6   http://www.google.com/apis/
       
     7   
       
     8 You need a Google-provided license key to use these services.
       
     9 Follow the link above to get one.  These functions will look in
       
    10 several places (in this order) for the license key:
       
    11 - the "license_key" argument of each function
       
    12 - the module-level LICENSE_KEY variable (call setLicense once to set it)
       
    13 - an environment variable called GOOGLE_LICENSE_KEY
       
    14 - a file called ".googlekey" in the current directory
       
    15 - a file called "googlekey.txt" in the current directory
       
    16 - a file called ".googlekey" in your home directory
       
    17 - a file called "googlekey.txt" in your home directory
       
    18 - a file called ".googlekey" in the same directory as google.py
       
    19 - a file called "googlekey.txt" in the same directory as google.py
       
    20 
       
    21 Sample usage:
       
    22 >>> import google
       
    23 >>> google.setLicense('...') # must get your own key!
       
    24 >>> data = google.doGoogleSearch('python')
       
    25 >>> data.meta.searchTime
       
    26 0.043221000000000002
       
    27 >>> data.results[0].URL
       
    28 'http://www.python.org/'
       
    29 >>> data.results[0].title
       
    30 '<b>Python</b> Language Website'
       
    31 
       
    32 See documentation of SearchResultsMetaData and SearchResult classes
       
    33 for other available attributes.
       
    34 """
       
    35 
       
# Module metadata, exposed as dunder attributes; version() prints several of them.
__author__ = "Mark Pilgrim (f8dy@diveintomark.org)"
__version__ = "0.5.2"
# CVS keyword expansions: the slices drop the leading "$Revision: " / "$Date: "
# and the trailing " $", leaving only the value ("1.1.1.1", "2005/09/29 21:38:49").
__cvsversion__ = "$Revision: 1.1.1.1 $"[11:-2]
__date__ = "$Date: 2005/09/29 21:38:49 $"[7:-2]
__copyright__ = "Copyright (c) 2002 Mark Pilgrim"
__license__ = "Python"
__credits__ = """David Ascher, for the install script
Erik Max Francis, for the command line interface
Michael Twomey, for HTTP proxy support"""
       
    45 
       
    46 import SOAP
       
    47 import os, sys, getopt
       
    48 
       
# Module-wide defaults; set them with setLicense() / setProxy().
LICENSE_KEY = None
HTTP_PROXY = None

# don't touch the rest of these constants
class NoLicenseKey(Exception): pass  # raised by getLicense() when no key is found
_url = 'http://api.google.com/search/beta2'  # endpoint handed to SOAP.SOAPProxy
_namespace = 'urn:GoogleSearch'              # namespace handed to SOAP.SOAPProxy
_false = SOAP.booleanType(0)  # pre-built SOAP booleans returned by _marshalBoolean()
_true = SOAP.booleanType(1)
_googlefile1 = ".googlekey"     # file names probed for a license key
_googlefile2 = "googlekey.txt"
       
    60 _licenseLocations = (
       
    61     (lambda key: key, 'passed to the function in license_key variable'),
       
    62     (lambda key: LICENSE_KEY, 'module-level LICENSE_KEY variable (call setLicense to set it)'),
       
    63     (lambda key: os.environ.get('GOOGLE_LICENSE_KEY', None), 'an environment variable called GOOGLE_LICENSE_KEY'),
       
    64     (lambda key: _contentsOf(os.getcwd(), _googlefile1), '%s in the current directory' % _googlefile1),
       
    65     (lambda key: _contentsOf(os.getcwd(), _googlefile2), '%s in the current directory' % _googlefile2),
       
    66     (lambda key: _contentsOf(os.environ.get('HOME', ''), _googlefile1), '%s in your home directory' % _googlefile1),
       
    67     (lambda key: _contentsOf(os.environ.get('HOME', ''), _googlefile2), '%s in your home directory' % _googlefile2),
       
    68     (lambda key: _contentsOf(_getScriptDir(), _googlefile1), '%s in the google.py directory' % _googlefile1),
       
    69     (lambda key: _contentsOf(_getScriptDir(), _googlefile2), '%s in the google.py directory' % _googlefile2)
       
    70     )
       
    71 
       
    72 ## administrative functions
       
    73 def version():
       
    74     print """PyGoogle %(__version__)s
       
    75 %(__copyright__)s
       
    76 released %(__date__)s
       
    77 
       
    78 Thanks to:
       
    79 %(__credits__)s""" % globals()
       
    80     
       
    81 def usage():
       
    82     program = os.path.basename(sys.argv[0])
       
    83     print """Usage: %(program)s [options] [querytype] query
       
    84 
       
    85 options:
       
    86   -k, --key= <license key> Google license key (see important note below)
       
    87   -1, -l, --lucky          show only first hit
       
    88   -m, --meta               show meta information
       
    89   -r, --reverse            show results in reverse order
       
    90   -x, --proxy= <url>       use HTTP proxy
       
    91   -h, --help               print this help
       
    92   -v, --version            print version and copyright information
       
    93   -t, --test               run test queries
       
    94 
       
    95 querytype:
       
    96   -s, --search= <query>    search (default)
       
    97   -c, --cache= <url>       retrieve cached page
       
    98   -p, --spelling= <word>   check spelling
       
    99 
       
   100 IMPORTANT NOTE: all Google functions require a valid license key;
       
   101 visit http://www.google.com/apis/ to get one.  %(program)s will look in
       
   102 these places (in order) and use the first license key it finds:
       
   103   * the key specified on the command line""" % vars()
       
   104     for get, location in _licenseLocations[2:]:
       
   105         print "  *", location
       
   106 
       
   107 ## utility functions
       
   108 def setLicense(license_key):
       
   109     """set license key"""
       
   110     global LICENSE_KEY
       
   111     LICENSE_KEY = license_key
       
   112     
       
   113 def getLicense(license_key = None):
       
   114     """get license key
       
   115 
       
   116     license key can come from any number of locations;
       
   117     see module docs for search order"""
       
   118     for get, location in _licenseLocations:
       
   119         rc = get(license_key)
       
   120         if rc: return rc
       
   121     usage()
       
   122     raise NoLicenseKey, 'get a license key at http://www.google.com/apis/'
       
   123 
       
   124 def setProxy(http_proxy):
       
   125     """set HTTP proxy"""
       
   126     global HTTP_PROXY
       
   127     HTTP_PROXY = http_proxy
       
   128 
       
   129 def getProxy(http_proxy = None):
       
   130     """get HTTP proxy"""
       
   131     return http_proxy or HTTP_PROXY
       
   132 
       
   133 def _contentsOf(dirname, filename):
       
   134     filename = os.path.join(dirname, filename)
       
   135     if not os.path.exists(filename): return None
       
   136     fsock = open(filename)
       
   137     contents = fsock.read()
       
   138     fsock.close()
       
   139     return contents
       
   140 
       
   141 def _getScriptDir():
       
   142     if __name__ == '__main__':
       
   143         return os.path.abspath(os.path.dirname(sys.argv[0]))
       
   144     else:
       
   145         return os.path.abspath(os.path.dirname(sys.modules[__name__].__file__))
       
   146 
       
   147 def _marshalBoolean(value):
       
   148     if value:
       
   149         return _true
       
   150     else:
       
   151         return _false
       
   152 
       
   153 ## output formatters
       
   154 def makeFormatter(outputFormat):
       
   155     classname = "%sOutputFormatter" % outputFormat.capitalize()
       
   156     return globals()[classname]()
       
   157 
       
   158 def output(results, params):
       
   159     formatter = makeFormatter(params.get("outputFormat", "text"))
       
   160     outputmethod = getattr(formatter, params["func"])
       
   161     outputmethod(results, params)
       
   162 
       
   163 class OutputFormatter:
       
   164     def boil(self, data):
       
   165         if type(data) == type(u""):
       
   166             return data.encode("ISO-8859-1", "replace")
       
   167         else:
       
   168             return data
       
   169 
       
   170 class TextOutputFormatter(OutputFormatter):
       
   171     def common(self, data, params):
       
   172         if params.get("showMeta", 0):
       
   173             meta = data.meta
       
   174             for category in meta.directoryCategories:
       
   175                 print "directoryCategory: %s" % self.boil(category["fullViewableName"])
       
   176             for attr in [node for node in dir(meta) if node <> "directoryCategories" and node[:2] <> '__']:
       
   177                 print "%s:" % attr, self.boil(getattr(meta, attr))
       
   178         
       
   179     def doGoogleSearch(self, data, params):
       
   180         results = data.results
       
   181         if params.get("feelingLucky", 0):
       
   182             results = results[:1]
       
   183         if params.get("reverseOrder", 0):
       
   184             results.reverse()
       
   185         for result in results:
       
   186             for attr in dir(result):
       
   187                 if attr == "directoryCategory":
       
   188                     print "directoryCategory:", self.boil(result.directoryCategory["fullViewableName"])
       
   189                 elif attr[:2] <> '__':
       
   190                     print "%s:" % attr, self.boil(getattr(result, attr))
       
   191             print
       
   192         self.common(data, params)
       
   193     
       
   194     def doGetCachedPage(self, data, params):
       
   195         print data
       
   196         self.common(data, params)
       
   197 
       
   198     doSpellingSuggestion = doGetCachedPage
       
   199 
       
   200 ## search results classes
       
   201 class _SearchBase:
       
   202     def __init__(self, params):
       
   203         for k, v in params.items():
       
   204             if isinstance(v, SOAP.structType):
       
   205                 v = v._asdict
       
   206             try:
       
   207                 if isinstance(v[0], SOAP.structType):
       
   208                     v = [node._asdict for node in v]
       
   209             except:
       
   210                 pass
       
   211             self.__dict__[str(k)] = v
       
   212 
       
class SearchResultsMetaData(_SearchBase):
    """metadata of search query results

    Available attributes:
    documentFiltering - flag indicates whether duplicate page filtering was performed in this search
    searchComments - human-readable informational message (example: "'the' is a very common word
        and was not included in your search")
    estimatedTotalResultsCount - estimated total number of results for this query
    estimateIsExact - flag indicates whether estimatedTotalResultsCount is an exact value
    searchQuery - search string that initiated this search
    startIndex - index of first result returned (zero-based)
    endIndex - index of last result returned (zero-based)
    searchTips - human-readable informational message on how to use Google better
    directoryCategories - list of dictionaries like this:
        {'fullViewableName': Open Directory category,
         'specialEncoding': encoding scheme of this directory category}
    searchTime - total search time, in seconds
    """
    pass
       
   232 
       
class SearchResult(_SearchBase):
    """search result

    Available attributes:
    URL - URL
    title - title (HTML)
    snippet - snippet showing query context (HTML)
    cachedSize - size of cached version of this result (KB)
    relatedInformationPresent - flag indicates that the "related:" keyword is supported for this URL
    hostName - When filtering occurs, a maximum of two results from any given host is returned.
        When this occurs, the second resultElement that comes from that host contains
        the host name in this parameter.
    directoryCategory - dictionary like this:
        {'fullViewableName': Open Directory category,
         'specialEncoding': encoding scheme of this directory category}
    directoryTitle - Open Directory title of this result (or blank)
    summary - Open Directory summary for this result (or blank)
    """
    pass
       
   252 
       
   253 class SearchReturnValue:
       
   254     """complete search results for a single query
       
   255 
       
   256     Available attributes:
       
   257     meta - SearchResultsMetaData
       
   258     results - list of SearchResult
       
   259     """
       
   260     def __init__(self, metadata, results):
       
   261         self.meta = metadata
       
   262         self.results = results
       
   263 
       
   264 ## main functions
       
   265 def doGoogleSearch(q, start=0, maxResults=10, filter=1, restrict='',
       
   266                    safeSearch=0, language='', inputencoding='', outputencoding='',
       
   267                    license_key = None, http_proxy = None):
       
   268     """search Google
       
   269 
       
   270     You need a license key to call this function; see
       
   271     http://www.google.com/apis/ to get one.  Then you can either pass it to
       
   272     this function every time, or set it globally; see the module docs for details.
       
   273     
       
   274     Parameters:
       
   275     q - search string.  Anything you could type at google.com, you can pass here.
       
   276         See http://www.google.com/help/features.html for examples of advanced features.
       
   277     start (optional) - zero-based index of first desired result (for paging through
       
   278         multiple pages of results)
       
   279     maxResults (optional) - maximum number of results, currently capped at 10
       
   280     filter (optional) - set to 1 to filter out similar results, set to 0 to see everything
       
   281     restrict (optional) - restrict results by country or topic.  Examples:
       
   282         Ukraine - search only sites located in Ukraine
       
   283         linux - search Linux sites only
       
   284         mac - search Mac sites only
       
   285         bsd - search FreeBSD sites only
       
   286         See the APIs_reference.html file in the SDK (http://www.google.com/apis/download.html)
       
   287         for more advanced examples and a full list of country codes and topics.
       
   288     safeSearch (optional) - set to 1 to filter results with SafeSearch (no adult material)
       
   289     language (optional) - restricts search to documents in one or more languages.  Example:
       
   290         lang_en - only return pages in English
       
   291         lang_fr - only return pages in French
       
   292         See the APIs_reference.html file in the SDK (http://www.google.com/apis/download.html)
       
   293         for more advanced examples and a full list of language codes.
       
   294     inputencoding (optional) - sets the character encoding of q parameter
       
   295     outputencoding (optional) - sets the character encoding of the returned results
       
   296         See the APIs_reference.html file in the SDK (http://www.google.com/apis/download.html)
       
   297         for a full list of encodings.
       
   298     http_proxy (optional) - address of HTTP proxy to use for sending and receiving SOAP messages
       
   299 
       
   300     Returns: SearchReturnValue
       
   301     .meta - SearchMetaData
       
   302     .results - list of SearchResult
       
   303     See documentation of these individual classes for list of available attributes
       
   304     """
       
   305     http_proxy = getProxy(http_proxy)
       
   306     remoteserver = SOAP.SOAPProxy(_url, namespace=_namespace, http_proxy=http_proxy)
       
   307     license_key = getLicense(license_key)
       
   308     filter = _marshalBoolean(filter)
       
   309     safeSearch = _marshalBoolean(safeSearch)
       
   310     data = remoteserver.doGoogleSearch(license_key, q, start, maxResults, filter, restrict,
       
   311                                        safeSearch, language, inputencoding, outputencoding)
       
   312     metadata = data._asdict
       
   313     del metadata["resultElements"]
       
   314     metadata = SearchResultsMetaData(metadata)
       
   315     results = [SearchResult(node._asdict) for node in data.resultElements]
       
   316     return SearchReturnValue(metadata, results)
       
   317 
       
   318 def doGetCachedPage(url, license_key = None, http_proxy = None):
       
   319     """get page from Google cache
       
   320 
       
   321     You need a license key to call this function; see
       
   322     http://www.google.com/apis/ to get one.  Then you can either pass it to
       
   323     this function every time, or set it globally; see the module docs for details.
       
   324     
       
   325     Parameters:
       
   326     url - address of page to get
       
   327     license_key (optional) - Google license key
       
   328     http_proxy (optional) - address of HTTP proxy to use for sending and receiving SOAP messages
       
   329 
       
   330     Returns: string, text of cached page    
       
   331     """
       
   332     http_proxy = getProxy(http_proxy)
       
   333     remoteserver = SOAP.SOAPProxy(_url, namespace=_namespace, http_proxy=http_proxy)
       
   334     license_key = getLicense(license_key)
       
   335     return remoteserver.doGetCachedPage(license_key, url)
       
   336 
       
   337 def doSpellingSuggestion(phrase, license_key = None, http_proxy = None):
       
   338     """get spelling suggestions from Google
       
   339 
       
   340     You need a license key to call this function; see
       
   341     http://www.google.com/apis/ to get one.  Then you can either pass it to
       
   342     this function every time, or set it globally; see the module docs for details.
       
   343     
       
   344     Parameters:
       
   345     phrase - word or phrase to spell-check
       
   346     http_proxy (optional) - address of HTTP proxy to use for sending and receiving SOAP messages
       
   347 
       
   348     Returns: text of suggested replacement, or None
       
   349     """
       
   350     http_proxy = getProxy(http_proxy)
       
   351     remoteserver = SOAP.SOAPProxy(_url, namespace=_namespace, http_proxy=http_proxy)
       
   352     license_key = getLicense(license_key)
       
   353     return remoteserver.doSpellingSuggestion(license_key, phrase)
       
   354 
       
   355 ## functional test suite (see googletest.py for unit test suite)
       
   356 def test():
       
   357     try:
       
   358         getLicense(None)
       
   359     except NoLicenseKey:
       
   360         return
       
   361     print "Searching for Python at google.com..."
       
   362     data = doGoogleSearch("Python")
       
   363     output(data, {"func": "doGoogleSearch"})
       
   364 
       
   365     print "\nSearching for 5 _French_ pages about Python, encoded in ISO-8859-1..."
       
   366     data = doGoogleSearch("Python", language='lang_fr', outputencoding='ISO-8859-1', maxResults=5)
       
   367     output(data, {"func": "doGoogleSearch"})
       
   368 
       
   369     phrase = "Pyhton programming languager"
       
   370     print "\nTesting spelling suggetions for '%s'..." % phrase
       
   371     data = doSpellingSuggestion(phrase)
       
   372     output(data, {"func": "doSpellingSuggestion"})
       
   373 
       
   374 ## main driver for command-line use
       
   375 def main(argv):
       
   376     if not argv:
       
   377         usage()
       
   378         return
       
   379     q = None
       
   380     func = None
       
   381     http_proxy = None
       
   382     license_key = None
       
   383     feelingLucky = 0
       
   384     showMeta = 0
       
   385     reverseOrder = 0
       
   386     runTest = 0
       
   387     outputFormat = "text"
       
   388     try:
       
   389         opts, args = getopt.getopt(argv, "s:c:p:k:lmrx:hvt1",
       
   390             ["search=", "cache=", "spelling=", "key=", "lucky", "meta", "reverse", "proxy=", "help", "version", "test"])
       
   391     except getopt.GetoptError:
       
   392         usage()
       
   393         sys.exit(2)
       
   394     for opt, arg in opts:
       
   395         if opt in ("-s", "--search"):
       
   396             q = arg
       
   397             func = "doGoogleSearch"
       
   398         elif opt in ("-c", "--cache"):
       
   399             q = arg
       
   400             func = "doGetCachedPage"
       
   401         elif opt in ("-p", "--spelling"):
       
   402             q = arg
       
   403             func = "doSpellingSuggestion"
       
   404         elif opt in ("-k", "--key"):
       
   405             license_key = arg
       
   406         elif opt in ("-l", "-1", "--lucky"):
       
   407             feelingLucky = 1
       
   408         elif opt in ("-m", "--meta"):
       
   409             showMeta = 1
       
   410         elif opt in ("-r", "--reverse"):
       
   411             reverseOrder = 1
       
   412         elif opt in ("-x", "--proxy"):
       
   413             http_proxy = arg
       
   414         elif opt in ("-h", "--help"):
       
   415             usage()
       
   416         elif opt in ("-v", "--version"):
       
   417             version()
       
   418         elif opt in ("-t", "--test"):
       
   419             runTest = 1
       
   420     if runTest:
       
   421         setLicense(license_key)
       
   422         setProxy(http_proxy)
       
   423         test()
       
   424     if args and not q:
       
   425         q = args[0]
       
   426         func = "doGoogleSearch"
       
   427     if func:
       
   428         results = globals()[func](q, http_proxy=http_proxy, license_key=license_key)
       
   429         output(results, locals())
       
   430 
       
# Run the command-line driver only when executed as a script, not on import.
if __name__ == '__main__':
    main(sys.argv[1:])