modules/google.py
author Mikael Berthe <mikael@lilotux.net>
Tue, 01 May 2007 23:59:49 +0200
changeset 18 3a35dd9adc73
parent 0 93b25987d3e5
permissions -rw-r--r--
Fix permissions in modules/

"""Python wrapper for Google web APIs

This module allows you to access Google's web APIs through SOAP,
to do things like search Google and get the results programmatically.
Described here:
  http://www.google.com/apis/
  
You need a Google-provided license key to use these services.
Follow the link above to get one.  These functions will look in
several places (in this order) for the license key:
- the "license_key" argument of each function
- the module-level LICENSE_KEY variable (call setLicense once to set it)
- an environment variable called GOOGLE_LICENSE_KEY
- a file called ".googlekey" in the current directory
- a file called "googlekey.txt" in the current directory
- a file called ".googlekey" in your home directory
- a file called "googlekey.txt" in your home directory
- a file called ".googlekey" in the same directory as google.py
- a file called "googlekey.txt" in the same directory as google.py

Sample usage:
>>> import google
>>> google.setLicense('...') # must get your own key!
>>> data = google.doGoogleSearch('python')
>>> data.meta.searchTime
0.043221000000000002
>>> data.results[0].URL
'http://www.python.org/'
>>> data.results[0].title
'<b>Python</b> Language Website'

See documentation of SearchResultsMetaData and SearchResult classes
for other available attributes.
"""

__author__ = "Mark Pilgrim (f8dy@diveintomark.org)"
__version__ = "0.5.2"
__cvsversion__ = "$Revision: 1.1.1.1 $"[11:-2]
__date__ = "$Date: 2005/09/29 21:38:49 $"[7:-2]
__copyright__ = "Copyright (c) 2002 Mark Pilgrim"
__license__ = "Python"
__credits__ = """David Ascher, for the install script
Erik Max Francis, for the command line interface
Michael Twomey, for HTTP proxy support"""

import SOAP
import os, sys, getopt

LICENSE_KEY = None
HTTP_PROXY = None

# don't touch the rest of these constants
class NoLicenseKey(Exception): pass
_url = 'http://api.google.com/search/beta2'
_namespace = 'urn:GoogleSearch'
_false = SOAP.booleanType(0)
_true = SOAP.booleanType(1)
_googlefile1 = ".googlekey"
_googlefile2 = "googlekey.txt"
_licenseLocations = (
    (lambda key: key, 'passed to the function in license_key variable'),
    (lambda key: LICENSE_KEY, 'module-level LICENSE_KEY variable (call setLicense to set it)'),
    (lambda key: os.environ.get('GOOGLE_LICENSE_KEY', None), 'an environment variable called GOOGLE_LICENSE_KEY'),
    (lambda key: _contentsOf(os.getcwd(), _googlefile1), '%s in the current directory' % _googlefile1),
    (lambda key: _contentsOf(os.getcwd(), _googlefile2), '%s in the current directory' % _googlefile2),
    (lambda key: _contentsOf(os.environ.get('HOME', ''), _googlefile1), '%s in your home directory' % _googlefile1),
    (lambda key: _contentsOf(os.environ.get('HOME', ''), _googlefile2), '%s in your home directory' % _googlefile2),
    (lambda key: _contentsOf(_getScriptDir(), _googlefile1), '%s in the google.py directory' % _googlefile1),
    (lambda key: _contentsOf(_getScriptDir(), _googlefile2), '%s in the google.py directory' % _googlefile2)
    )

## administrative functions
def version():
    print """PyGoogle %(__version__)s
%(__copyright__)s
released %(__date__)s

Thanks to:
%(__credits__)s""" % globals()
    
def usage():
    program = os.path.basename(sys.argv[0])
    print """Usage: %(program)s [options] [querytype] query

options:
  -k, --key= <license key> Google license key (see important note below)
  -1, -l, --lucky          show only first hit
  -m, --meta               show meta information
  -r, --reverse            show results in reverse order
  -x, --proxy= <url>       use HTTP proxy
  -h, --help               print this help
  -v, --version            print version and copyright information
  -t, --test               run test queries

querytype:
  -s, --search= <query>    search (default)
  -c, --cache= <url>       retrieve cached page
  -p, --spelling= <word>   check spelling

IMPORTANT NOTE: all Google functions require a valid license key;
visit http://www.google.com/apis/ to get one.  %(program)s will look in
these places (in order) and use the first license key it finds:
  * the key specified on the command line""" % vars()
    for get, location in _licenseLocations[2:]:
        print "  *", location

## utility functions
def setLicense(license_key):
    """set license key"""
    global LICENSE_KEY
    LICENSE_KEY = license_key
    
def getLicense(license_key = None):
    """get license key

    license key can come from any number of locations;
    see module docs for search order"""
    for get, location in _licenseLocations:
        rc = get(license_key)
        if rc: return rc
    usage()
    raise NoLicenseKey, 'get a license key at http://www.google.com/apis/'

def setProxy(http_proxy):
    """set HTTP proxy"""
    global HTTP_PROXY
    HTTP_PROXY = http_proxy

def getProxy(http_proxy = None):
    """get HTTP proxy"""
    return http_proxy or HTTP_PROXY

def _contentsOf(dirname, filename):
    filename = os.path.join(dirname, filename)
    if not os.path.exists(filename): return None
    fsock = open(filename)
    contents = fsock.read()
    fsock.close()
    return contents

def _getScriptDir():
    if __name__ == '__main__':
        return os.path.abspath(os.path.dirname(sys.argv[0]))
    else:
        return os.path.abspath(os.path.dirname(sys.modules[__name__].__file__))

def _marshalBoolean(value):
    if value:
        return _true
    else:
        return _false

## output formatters
def makeFormatter(outputFormat):
    classname = "%sOutputFormatter" % outputFormat.capitalize()
    return globals()[classname]()

def output(results, params):
    formatter = makeFormatter(params.get("outputFormat", "text"))
    outputmethod = getattr(formatter, params["func"])
    outputmethod(results, params)

class OutputFormatter:
    def boil(self, data):
        if type(data) == type(u""):
            return data.encode("ISO-8859-1", "replace")
        else:
            return data

class TextOutputFormatter(OutputFormatter):
    def common(self, data, params):
        if params.get("showMeta", 0):
            meta = data.meta
            for category in meta.directoryCategories:
                print "directoryCategory: %s" % self.boil(category["fullViewableName"])
            for attr in [node for node in dir(meta) if node <> "directoryCategories" and node[:2] <> '__']:
                print "%s:" % attr, self.boil(getattr(meta, attr))
        
    def doGoogleSearch(self, data, params):
        results = data.results
        if params.get("feelingLucky", 0):
            results = results[:1]
        if params.get("reverseOrder", 0):
            results.reverse()
        for result in results:
            for attr in dir(result):
                if attr == "directoryCategory":
                    print "directoryCategory:", self.boil(result.directoryCategory["fullViewableName"])
                elif attr[:2] <> '__':
                    print "%s:" % attr, self.boil(getattr(result, attr))
            print
        self.common(data, params)
    
    def doGetCachedPage(self, data, params):
        print data
        self.common(data, params)

    doSpellingSuggestion = doGetCachedPage

## search results classes
class _SearchBase:
    def __init__(self, params):
        for k, v in params.items():
            if isinstance(v, SOAP.structType):
                v = v._asdict
            try:
                if isinstance(v[0], SOAP.structType):
                    v = [node._asdict for node in v]
            except:
                pass
            self.__dict__[str(k)] = v

class SearchResultsMetaData(_SearchBase):
    """metadata of search query results

    Available attributes:
    documentFiltering - flag indicates whether duplicate page filtering was perfomed in this search
    searchComments - human-readable informational message (example: "'the' is a very common word
        and was not included in your search")
    estimatedTotalResultsCount - estimated total number of results for this query
    estimateIsExact - flag indicates whether estimatedTotalResultsCount is an exact value
    searchQuery - search string that initiated this search
    startIndex - index of first result returned (zero-based)
    endIndex - index of last result returned (zero-based)
    searchTips - human-readable informational message on how to use Google bette
    directoryCategories - list of dictionaries like this:
        {'fullViewableName': Open Directory category,
         'specialEncoding': encoding scheme of this directory category}
    searchTime - total search time, in seconds
    """    
    pass

class SearchResult(_SearchBase):
    """search result

    Available attributes:
    URL - URL
    title - title (HTML)
    snippet - snippet showing query context (HTML)
    cachedSize - size of cached version of this result, (KB)
    relatedInformationPresent - flag indicates that the "related:" keyword is supported for this URL
    hostName: When filtering occurs, a maximum of two results from any given host is returned.
        When this occurs, the second resultElement that comes from that host contains
        the host name in this parameter.
    directoryCategory: dictionary like this:
        {'fullViewableName': Open Directory category,
         'specialEncoding': encoding scheme of this directory category}
    directoryTitle: Open Directory title of this result (or blank)
    summary - Open Directory summary for this result (or blank)
    """
    pass

class SearchReturnValue:
    """complete search results for a single query

    Available attributes:
    meta - SearchResultsMetaData
    results - list of SearchResult
    """
    def __init__(self, metadata, results):
        self.meta = metadata
        self.results = results

## main functions
def doGoogleSearch(q, start=0, maxResults=10, filter=1, restrict='',
                   safeSearch=0, language='', inputencoding='', outputencoding='',
                   license_key = None, http_proxy = None):
    """search Google

    You need a license key to call this function; see
    http://www.google.com/apis/ to get one.  Then you can either pass it to
    this function every time, or set it globally; see the module docs for details.
    
    Parameters:
    q - search string.  Anything you could type at google.com, you can pass here.
        See http://www.google.com/help/features.html for examples of advanced features.
    start (optional) - zero-based index of first desired result (for paging through
        multiple pages of results)
    maxResults (optional) - maximum number of results, currently capped at 10
    filter (optional) - set to 1 to filter out similar results, set to 0 to see everything
    restrict (optional) - restrict results by country or topic.  Examples:
        Ukraine - search only sites located in Ukraine
        linux - search Linux sites only
        mac - search Mac sites only
        bsd - search FreeBSD sites only
        See the APIs_reference.html file in the SDK (http://www.google.com/apis/download.html)
        for more advanced examples and a full list of country codes and topics.
    safeSearch (optional) - set to 1 to filter results with SafeSearch (no adult material)
    language (optional) - restricts search to documents in one or more languages.  Example:
        lang_en - only return pages in English
        lang_fr - only return pages in French
        See the APIs_reference.html file in the SDK (http://www.google.com/apis/download.html)
        for more advanced examples and a full list of language codes.
    inputencoding (optional) - sets the character encoding of q parameter
    outputencoding (optional) - sets the character encoding of the returned results
        See the APIs_reference.html file in the SDK (http://www.google.com/apis/download.html)
        for a full list of encodings.
    http_proxy (optional) - address of HTTP proxy to use for sending and receiving SOAP messages

    Returns: SearchReturnValue
    .meta - SearchMetaData
    .results - list of SearchResult
    See documentation of these individual classes for list of available attributes
    """
    http_proxy = getProxy(http_proxy)
    remoteserver = SOAP.SOAPProxy(_url, namespace=_namespace, http_proxy=http_proxy)
    license_key = getLicense(license_key)
    filter = _marshalBoolean(filter)
    safeSearch = _marshalBoolean(safeSearch)
    data = remoteserver.doGoogleSearch(license_key, q, start, maxResults, filter, restrict,
                                       safeSearch, language, inputencoding, outputencoding)
    metadata = data._asdict
    del metadata["resultElements"]
    metadata = SearchResultsMetaData(metadata)
    results = [SearchResult(node._asdict) for node in data.resultElements]
    return SearchReturnValue(metadata, results)

def doGetCachedPage(url, license_key = None, http_proxy = None):
    """get page from Google cache

    You need a license key to call this function; see
    http://www.google.com/apis/ to get one.  Then you can either pass it to
    this function every time, or set it globally; see the module docs for details.
    
    Parameters:
    url - address of page to get
    license_key (optional) - Google license key
    http_proxy (optional) - address of HTTP proxy to use for sending and receiving SOAP messages

    Returns: string, text of cached page    
    """
    http_proxy = getProxy(http_proxy)
    remoteserver = SOAP.SOAPProxy(_url, namespace=_namespace, http_proxy=http_proxy)
    license_key = getLicense(license_key)
    return remoteserver.doGetCachedPage(license_key, url)

def doSpellingSuggestion(phrase, license_key = None, http_proxy = None):
    """get spelling suggestions from Google

    You need a license key to call this function; see
    http://www.google.com/apis/ to get one.  Then you can either pass it to
    this function every time, or set it globally; see the module docs for details.
    
    Parameters:
    phrase - word or phrase to spell-check
    http_proxy (optional) - address of HTTP proxy to use for sending and receiving SOAP messages

    Returns: text of suggested replacement, or None
    """
    http_proxy = getProxy(http_proxy)
    remoteserver = SOAP.SOAPProxy(_url, namespace=_namespace, http_proxy=http_proxy)
    license_key = getLicense(license_key)
    return remoteserver.doSpellingSuggestion(license_key, phrase)

## functional test suite (see googletest.py for unit test suite)
def test():
    try:
        getLicense(None)
    except NoLicenseKey:
        return
    print "Searching for Python at google.com..."
    data = doGoogleSearch("Python")
    output(data, {"func": "doGoogleSearch"})

    print "\nSearching for 5 _French_ pages about Python, encoded in ISO-8859-1..."
    data = doGoogleSearch("Python", language='lang_fr', outputencoding='ISO-8859-1', maxResults=5)
    output(data, {"func": "doGoogleSearch"})

    phrase = "Pyhton programming languager"
    print "\nTesting spelling suggetions for '%s'..." % phrase
    data = doSpellingSuggestion(phrase)
    output(data, {"func": "doSpellingSuggestion"})

## main driver for command-line use
def main(argv):
    if not argv:
        usage()
        return
    q = None
    func = None
    http_proxy = None
    license_key = None
    feelingLucky = 0
    showMeta = 0
    reverseOrder = 0
    runTest = 0
    outputFormat = "text"
    try:
        opts, args = getopt.getopt(argv, "s:c:p:k:lmrx:hvt1",
            ["search=", "cache=", "spelling=", "key=", "lucky", "meta", "reverse", "proxy=", "help", "version", "test"])
    except getopt.GetoptError:
        usage()
        sys.exit(2)
    for opt, arg in opts:
        if opt in ("-s", "--search"):
            q = arg
            func = "doGoogleSearch"
        elif opt in ("-c", "--cache"):
            q = arg
            func = "doGetCachedPage"
        elif opt in ("-p", "--spelling"):
            q = arg
            func = "doSpellingSuggestion"
        elif opt in ("-k", "--key"):
            license_key = arg
        elif opt in ("-l", "-1", "--lucky"):
            feelingLucky = 1
        elif opt in ("-m", "--meta"):
            showMeta = 1
        elif opt in ("-r", "--reverse"):
            reverseOrder = 1
        elif opt in ("-x", "--proxy"):
            http_proxy = arg
        elif opt in ("-h", "--help"):
            usage()
        elif opt in ("-v", "--version"):
            version()
        elif opt in ("-t", "--test"):
            runTest = 1
    if runTest:
        setLicense(license_key)
        setProxy(http_proxy)
        test()
    if args and not q:
        q = args[0]
        func = "doGoogleSearch"
    if func:
        results = globals()[func](q, http_proxy=http_proxy, license_key=license_key)
        output(results, locals())

if __name__ == '__main__':
    main(sys.argv[1:])