O'Reilly Hacks
oreilly.comO'Reilly NetworkSafari BookshelfConferences Sign In/My Account | View Cart   
Book List Learning Lab PDFs O'Reilly Gear Newsletters Press Room Jobs  



HACK
#39
Meander Your Google Neighborhood
Google Neighborhood attempts to detangle the Web by building a "neighborhood" of sites around a URL
The Code
[Discuss (0) | Link to this hack]

The Code

Google Neighborhood is written in the Python (http://www.python.org) programming language. Your system will need to have Python installed for you to run this hack.

"""Blogroll finder and aggregator"""

# Fixed OCR-garbled dunder names (were printed as __author_  _ etc.,
# which is a syntax error).
__author__ = "Mark Pilgrim (f8dy@diveintomark.org)"
__copyright__ = "Copyright 2002, Mark Pilgrim"
__license__ = "Python"

try:
    # Optional add-on: socket-level timeouts so a dead server cannot
    # hang the crawl forever.
    import timeoutsocket # http://www.timo-tasi.org/python/timeoutsocket.py
    timeoutsocket.setDefaultSocketTimeout(10)
except ImportError:
    # timeoutsocket is third-party; run without timeouts if it is absent.
    pass
import urllib, urlparse, os, time, operator, sys, pickle, re, cgi
from sgmllib import SGMLParser
from threading import *

# Socket read chunk size (bytes).
BUFFERSIZE = 1024
# Link extensions that mark non-HTML resources: never blogroll entries.
IGNOREEXTS = ('.xml', '.opml', '.rss', '.rdf', '.pdf', '.doc')
# Extensions (or none at all) that look like ordinary web pages.
INCLUDEEXTS = ('', '.html', '.htm', '.shtml', '.php', '.asp', '.jsp')
# Domains that are never part of anyone's "neighborhood".
IGNOREDOMAINS = ('cgi.alexa.com', 'adserver1.backbeatmedia.com', 'ask.slashdot.org', 
'freshmeat.net', 'readroom.ipl.org', 'amazon.com', 'ringsurf.com')

def prettyURL(url):
    """Return a human-friendly form of *url*: the scheme is dropped and
    a bare-root path removed, e.g. 'http://x.com/' -> 'x.com'."""
    pieces = urlparse.urlparse(url)
    domain = pieces[1]
    path = pieces[2]
    if path == '/':
        path = ''
    rebuilt = urlparse.urlunparse(('', domain, path, '', '', ''))
    return rebuilt.replace('//', '')
    
def simplifyURL(url):
    """Canonicalize *url* so equivalent links compare equal: drop 'www.',
    collapse '/coming.html' to '/', add a trailing slash to bare domains."""
    url = url.replace('www.', '').replace('/coming.html', '/')
    path = urlparse.urlparse(url)[2]
    if not path:
        url = url + '/'
    return url

class MinimalURLOpener(urllib.FancyURLopener):
    """URL opener with a blank User-Agent that treats HTTP 401 as
    end-of-document instead of prompting for credentials."""
    def __init__(self, *args):
        # Fixed OCR-garbled __init_  _ names; replaced the deprecated
        # apply() builtin with direct argument unpacking.
        urllib.FancyURLopener.__init__(self, *args)
        # Blank UA: some servers block Python's default agent string.
        self.addheaders = [('User-agent', '')]
    def http_error_401(self, url, fp, errcode, errmsg, headers, data=None):
        # Ignore auth challenges; protected pages simply yield no links.
        pass

class BlogrollParser(SGMLParser):
    """Parse an HTML page and guess its blogroll: any run of 4 or more
    consecutive outbound links (with no intervening text) counts."""

    def __init__(self, url):
        # Fixed OCR-garbled __init_  _ names.
        SGMLParser.__init__(self)
        self.url = url
        self.reset()

    def reset(self):
        SGMLParser.reset(self)
        self.possible = []   # current uninterrupted run of candidate links
        self.blogroll = []   # confirmed blogroll links
        self.ina = 0         # inside an <a> element?

    def _goodlink(self, href):
        """Return 1 if *href* looks like an external weblog home page."""
        protocol, domain, path, params, query, fragment = urlparse.urlparse(href)
        if protocol.lower() != 'http': return 0
        # Skip links back into the page's own site.
        if self.url.find(domain) != -1: return 0
        if domain in IGNOREDOMAINS: return 0
        if domain.find(':5335') != -1: return 0   # Radio UserLand local port
        if domain.find('.google') != -1: return 0
        if fragment: return 0
        shortpath, ext = os.path.splitext(path)
        ext = ext.lower()
        if ext in INCLUDEEXTS: return 1
        if ext in IGNOREEXTS: return 0
        # more rules here?
        return 1

    def _confirmpossibles(self):
        # A run of 4+ uninterrupted links is assumed to be a blogroll.
        if len(self.possible) >= 4:
            for url in self.possible:
                if url not in self.blogroll:
                    self.blogroll.append(url)
        self.possible = []

    def start_a(self, attrs):
        self.ina = 1
        hreflist = [e[1] for e in attrs if e[0] == 'href']
        if not hreflist: return
        href = simplifyURL(hreflist[0])
        if self._goodlink(href):
            self.possible.append(href)

    def end_a(self):
        self.ina = 0

    def handle_data(self, data):
        if self.ina: return
        # Non-whitespace text between links ends the current run.
        if data.strip():
            self._confirmpossibles()

    def end_html(self):
        # Two fixes: sgmllib calls end-tag handlers with NO arguments, so
        # the original 'end_html(self, attrs)' raised TypeError on </html>;
        # and it called the nonexistent self.confirmpossibles (missing
        # underscore), an AttributeError.
        self._confirmpossibles()

def getRadioBlogroll(url):
    """Fetch a Radio UserLand site's mySubscriptions.opml and return the
    htmlUrl values it lists, or [] if the file is missing or not OPML."""
    try:
        usock = MinimalURLOpener().open('%s/gems/mySubscriptions.opml' % url)
        opmlSource = usock.read()
        usock.close()
    except Exception:
        # Best effort, but no longer a bare except (which would also
        # swallow KeyboardInterrupt/SystemExit): any fetch failure just
        # means "no Radio blogroll here".
        return []
    if opmlSource.find('<opml') == -1: return []
    radioBlogroll = []
    start = 0
    while 1:
        p = opmlSource.find('htmlUrl="', start)
        if p == -1: break
        # The attribute value is the second '"'-delimited field from p.
        refurl = opmlSource[p:p+100].split('"')[1]
        radioBlogroll.append(refurl)
        # Resume just past this attribute's closing quote.
        start = p + len(refurl) + 10
    return radioBlogroll

def getBlogroll(url):
    """Return a list of blogroll URLs for *url*: the site's Radio
    UserLand subscriptions if present, else links scraped from its HTML."""
    if not url.startswith('http://'):
        url = 'http://' + url
    radioBlogroll = getRadioBlogroll(url)
    if radioBlogroll:
        return radioBlogroll
    parser = BlogrollParser(url)
    try:
        usock = MinimalURLOpener().open(url)
        htmlSource = usock.read()
        usock.close()
    except Exception:
        # Unreachable or unreadable page: no blogroll (narrowed from a
        # bare except so Ctrl-C still interrupts the crawl).
        return []
    parser.feed(htmlSource)
    return parser.blogroll

class BlogrollThread(Thread):
    """Worker thread: fetch one URL's blogroll and hand it back to
    *master* via master.callback(url, blogroll)."""
    def __init__(self, master, url):
        # Fixed OCR-garbled __init_  _ names.
        Thread.__init__(self)
        self.master = master
        self.url = url

    def run(self):
        self.master.callback(self.url, getBlogroll(self.url))

class BlogrollThreadMaster:
    """Fan out blogroll fetches across worker threads, 5 URLs at a time.

    Results accumulate in self.blogrollDict, mapping each fetched URL to
    its list of blogroll links.  *recurse* > 1 also fetches the blogrolls
    of every newly discovered URL, one extra level per unit of recurse.
    """
    def __init__(self, url, recurse):
        # Fixed OCR-garbled __init_  _ name.
        self.blogrollDict = {}
        self.done = 0
        if isinstance(url, str):
            # A single URL: seed the crawl with its own blogroll.
            blogroll = getBlogroll(url)
        else:
            # Already a list of URLs.
            blogroll = url
        self.run(blogroll, recurse)

    def callback(self, url, blogroll):
        # Worker threads report here; ignore stragglers that finish after
        # the crawl is marked done.
        if not self.done:
            self.blogrollDict[url] = blogroll

    def run(self, blogroll, recurse):
        start = 0
        end = 5
        while 1:
            threads = []
            for url in blogroll[start:end]:
                if url not in self.blogrollDict:
                    t = BlogrollThread(self, url)
                    threads.append(t)
            for t in threads:
                t.start()
                time.sleep(0.000001)
            for t in threads:
                time.sleep(0.000001)
                # Cap the join so one hung fetch can't stall the batch.
                t.join(10)
            start += 5
            end += 5
            # >= (was >) avoids one final pass over an empty slice.
            if start >= len(blogroll): break
        if recurse > 1:
            # Flatten all discovered links; sum() handles the empty case,
            # where reduce(operator.add, ...) raised TypeError.
            masterlist = sum(self.blogrollDict.values(), [])
            newlist = [url for url in masterlist if url not in self.blogrollDict]
            self.run(newlist, recurse - 1)
        else:
            self.done = 1

def sortBlogrollData(blogrollDict):
    """Count how many blogrolls cite each URL and return (count, url)
    tuples, most-cited first."""
    counts = {}
    for links in blogrollDict.values():
        for link in links:
            counts[link] = counts.get(link, 0) + 1
    pairs = [(n, link) for link, n in counts.items()]
    return sorted(pairs, reverse=True)

def trimdata(sortI, cutoff):
    """Keep only the (count, url) entries cited at least *cutoff* times."""
    trimmed = []
    for count, url in sortI:
        if count >= cutoff:
            trimmed.append((count, url))
    return trimmed

def getRelated(url):
    """Ask the Google SOAP API for up to 30 URLs 'related:' to *url*."""
    import google
    related = []
    offset = 0
    # The API returns at most 10 hits per call; page through up to 3.
    for page in range(3):
        data = google.doGoogleSearch('related:%s' % url, offset)
        related.extend([hit.URL for hit in data.results])
        offset += 10
        # A short page means there are no more results.
        if len(data.results) < 10: break
    return related

def getNeighborhood(baseURL):
    """Build the 'neighborhood' of *baseURL*: fetch Google-related sites,
    harvest their blogrolls, and return (count, url, prettyname) tuples
    for every site cited at least twice, most-cited first."""
    related = getRelated(baseURL)
    rollDict = BlogrollThreadMaster(related, 1).blogrollDict
    ranked = trimdata(sortBlogrollData(rollDict), 2)
    return [(count, url, prettyURL(url)) for count, url in ranked]
    
def render_html(baseURL, data):
    """Render the neighborhood *data* ([(count, url, title), ...]) as an
    HTML table with an 'explore' link that recurses into each site.

    Fixes: 'shope=' typo in the emitted <th>; the explore query string is
    now built with urllib.quote -- cgi.escape does HTML escaping, not URL
    encoding, so URLs containing '&' broke the query parameter.
    """
    output = []
    output.append("""
<table class="socialnetwork" summary="neighborhood for %s">
<caption>Neighborhood for %s</caption>
<thead>
<tr>
<th scope="col">Name</th>
<th scope="col">Links</th>
<th scope="col">Explore</th>
</tr>
</thead>
<tbody>""" % (cgi.escape(prettyURL(baseURL)), cgi.escape(prettyURL(baseURL))))
    for c, url, title in data:
        explore = 'http://diveintomark.org/cgi-bin/neighborhood.cgi?url=%s' % \
            urllib.quote(url)
        output.append("""<tr><td><a href="%s">%s</a></td>
<td>%s</td><td><a href="%s">explore</a></td></tr
>""" % (url, title, c, explore))
    output.append("""
</tbody>
</table>""")
    return "".join(output)

def render_rss(baseURL, data):
    """Render the neighborhood *data* ([(count, url, title), ...]) as an
    RSS 1.0 (RDF) feed for *baseURL*.

    NOTE(review): url/title values are interpolated into the XML without
    escaping -- a title containing '&' or '<' would yield invalid XML.
    """
    title = prettyURL(baseURL)
    channeltitle = "%s neighborhood" % title
    # Hard-coded -05:00 offset; presumably the server runs in US Eastern
    # time -- TODO confirm.
    localtime = time.strftime('%Y-%m-%dT%H:%M:%S-05:00', time.localtime( ))
    output = []
    # Channel header: %(name)s placeholders are filled from locals(), so
    # the local variable names above are part of the template contract.
    output.append("""<?xml version="1.0"?>
<rdf:RDF xmlns="http://purl.org/rss/1.0/"
xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:dc="http://purl.org/dc/
elements/1.1/" xmlns:sy="http://purl.org/rss/1.0/modules/syndication/" xmlns:admin=
"http://webns.net/mvcb/">

<channel rdf:about="%(baseURL)s">
<title>%(channeltitle)s</title>
<link>%(baseURL)s</link>
<description>Sites in the virtual neighborhood of %(title)s</description>
<language>en-us</language>
<lastBuildDate>%(localtime)s</lastBuildDate>
<pubDate>%(localtime)s</pubDate>
<admin:generatorAgent rdf:resource="http://divintomark.org/cgi-bin/neighborhood.cgi/
?v=1.1" />
<admin:errorReportsTo rdf:resource="mailto:f8dy@diveintomark.org"/>
<sy:updatePeriod>weekly</sy:updatePeriod>
<sy:updateFrequency>1</sy:updateFrequency>
<sy:updateBase>2000-01-01T12:00+00:00</sy:updateBase>
<items>
<rdf:Seq>
""" % locals( ))
    # Table of contents: one rdf:li per neighborhood site.
    for c, url, title in data:
        output.append("""<rdf:li rdf:resource="%s" />
""" % url)
    output.append("""</rdf:Seq>
</items>
</channel>
""")
    # One <item> per site; %(c)s / %(url)s / %(title)s are taken from the
    # loop variables via locals().
    for c, url, title in data:
        output.append("""<item rdf:about="%(url)s">
<title>%(title)s</title>
<link>%(url)s</link>
<description>%(c)s links</description>
</item>
""" % locals( ))
    output.append("""</rdf:RDF>""")
    return "".join(output)
                      
if __name__ == '__main_  _':
    print render_html(getNeighborhood(sys.argv[1]))

You'll also need an HTML form to call the neighborhood.cgi script. Here's a simple one:

<form action="/cgi-bin/neighborhood.cgi" method="get">
URL: <input name="url" type="text" />
<br />
Output as: <input name="fl" type="radio" value="html" checked="checked" /> HTML
<input name="fl" type="radio" value="rss" /> RSS
<br />
<input type="submit" value="Meander" />
</form>

Save the form as neighborhood.html, being sure to alter the action= to point at the location in which you installed the CGI script ["How to Run the Scripts" in the Preface].


O'Reilly Home | Privacy Policy

© 2007 O'Reilly Media, Inc.
Website: | Customer Service: | Book issues:

All trademarks and registered trademarks appearing on oreilly.com are the property of their respective owners.