"""Blogroll finder and aggregator"""
__author__ = "Mark Pilgrim (f8dy@diveintomark.org)"
__copyright__ = "Copyright 2002, Mark Pilgrim"
__license__ = "Python"

# timeoutsocket is an optional third-party module.  When it is installed, its
# default socket timeout is set to 10 (presumably seconds -- confirm against
# the timeoutsocket documentation); when it is missing, the script simply runs
# without a timeout.
try:
    import timeoutsocket  # http://www.timo-tasi.org/python/timeoutsocket.py
    timeoutsocket.setDefaultSocketTimeout(10)
except ImportError:
    pass
import urllib, urlparse, os, time, operator, sys, pickle, re, cgi, time
from sgmllib import SGMLParser
from threading import *
# Not referenced anywhere else in the visible source.
BUFFERSIZE = 1024

# Link targets with one of these extensions are rejected by
# BlogrollParser._goodlink (feeds and documents, not web pages).
IGNOREEXTS = (
    '.xml',
    '.opml',
    '.rss',
    '.rdf',
    '.pdf',
    '.doc',
)

# Link targets with one of these extensions (or with no extension at all)
# are accepted by BlogrollParser._goodlink.
INCLUDEEXTS = (
    '',
    '.html',
    '.htm',
    '.shtml',
    '.php',
    '.asp',
    '.jsp',
)

# Links whose host is exactly one of these are never treated as blogroll
# entries.
IGNOREDOMAINS = (
    'cgi.alexa.com',
    'adserver1.backbeatmedia.com',
    'ask.slashdot.org',
    'freshmeat.net',
    'readroom.ipl.org',
    'amazon.com',
    'ringsurf.com',
)
def prettyURL(url):
    """Return a short display form of url: its host followed by its path.

    The scheme, params, query string and fragment are dropped, and a bare
    root path ("/") is omitted, so "http://example.com/" becomes
    "example.com".
    """
    protocol, domain, path, params, query, fragment = urlparse.urlparse(url)
    if path == '/':
        path = ''
    # Join the two parts directly.  The previous urlunparse(...).replace('//', '')
    # round trip removed the leading '//' but also deleted every double slash
    # that happened to occur inside the path.
    return domain + path
def simplifyURL(url):
    """Normalize url so that trivially different spellings compare equal.

    Every occurrence of "www." is removed, "/coming.html" is rewritten to
    "/", and a URL without a path component is given the path "/".
    """
    url = url.replace('www.', '')
    url = url.replace('/coming.html', '/')
    protocol, domain, path, params, query, fragment = urlparse.urlparse(url)
    if not path:
        # Insert the slash where the path belongs.  Appending it to the whole
        # string would glue it onto the query or fragment when one is present
        # (e.g. "http://host?x=1" -> "http://host?x=1/").
        url = urlparse.urlunparse((protocol, domain, '/', params, query, fragment))
    return url
class MinimalURLOpener(urllib.FancyURLopener):
    """FancyURLopener that sends an empty User-agent header and has a no-op
    401 handler."""

    def __init__(self, *args, **kwargs):
        # Direct call instead of the deprecated apply() builtin.
        urllib.FancyURLopener.__init__(self, *args, **kwargs)
        # Replace the default header list so the User-agent value is empty.
        self.addheaders = [('User-agent', '')]

    def http_error_401(self, url, fp, errcode, errmsg, headers, data=None):
        """Do nothing on a "401 Unauthorized" response.

        This replaces the inherited handler, which (per the urllib
        documentation) may prompt for a user name and password -- something
        an unattended crawler must never do.
        """
        pass
class BlogrollParser(SGMLParser):
    """SGML/HTML parser that picks a blogroll out of a page.

    A blogroll is recognized as a run of at least four acceptable links
    (see _goodlink) with no non-whitespace text between them.  After
    feeding a page, the collected URLs are available in self.blogroll.
    """

    def __init__(self, url):
        """url is the address of the page being parsed; links that point back
        to it are not counted."""
        SGMLParser.__init__(self)
        self.url = url
        self.reset()

    def reset(self):
        """Clear the parser state and everything collected so far."""
        SGMLParser.reset(self)
        self.possible = []   # current run of candidate links
        self.blogroll = []   # confirmed links, in order of first appearance
        self.ina = 0         # 1 while inside an <a> element

    def _goodlink(self, href):
        """Return 1 if href looks like a link to another site's page, else 0."""
        protocol, domain, path, params, query, fragment = urlparse.urlparse(href)
        if protocol.lower() != 'http':
            return 0
        # The link's host occurs in the page's own URL: an internal link.
        if self.url.find(domain) != -1:
            return 0
        if domain in IGNOREDOMAINS:
            return 0
        # NOTE(review): port 5335 presumably identifies a locally hosted
        # desktop server -- confirm why it is excluded.
        if domain.find(':5335') != -1:
            return 0
        if domain.find('.google') != -1:
            return 0
        if fragment:
            return 0
        ext = os.path.splitext(path)[1].lower()
        if ext in INCLUDEEXTS:
            return 1
        if ext in IGNOREEXTS:
            return 0
        # more rules here?
        return 1

    def _confirmpossibles(self):
        """End the current run of links, keeping it if it is long enough."""
        if len(self.possible) >= 4:
            for url in self.possible:
                if url not in self.blogroll:
                    self.blogroll.append(url)
        self.possible = []

    def start_a(self, attrs):
        self.ina = 1
        hreflist = [value for name, value in attrs if name == 'href']
        if not hreflist:
            return
        href = simplifyURL(hreflist[0])
        if self._goodlink(href):
            self.possible.append(href)

    def end_a(self):
        self.ina = 0

    def handle_data(self, data):
        # Link text never breaks a run; any other visible text does.
        if self.ina:
            return
        if data.strip():
            self._confirmpossibles()

    def start_html(self, attrs):
        # sgmllib only dispatches end_<tag>() for tags that a start_<tag>()
        # handler has put on its stack, so this no-op is what makes
        # end_html() below reachable.
        pass

    def end_html(self):
        # End handlers are called without arguments.  The previous version
        # took an extra "attrs" parameter and called the non-existent
        # self.confirmpossibles().
        self._confirmpossibles()

    def close(self):
        """Finish parsing and flush a run of links that reaches the end of
        the document (covers pages without an <html>...</html> wrapper)."""
        SGMLParser.close(self)
        self._confirmpossibles()
def getRadioBlogroll(url):
    """Return the htmlUrl values listed in <url>/gems/mySubscriptions.opml.

    Returns an empty list when the file cannot be fetched or does not
    contain an "<opml" marker.
    """
    try:
        # Strip trailing slashes so the request path has no "//" in it.
        usock = MinimalURLOpener().open(
            '%s/gems/mySubscriptions.opml' % url.rstrip('/'))
        try:
            opmlSource = usock.read()
        finally:
            # Close the connection even when read() fails.
            usock.close()
    except Exception:
        # Best effort: any fetch problem just means "no OPML blogroll".
        return []
    if opmlSource.find('<opml') == -1:
        return []
    # Capture each attribute value in full.  The previous scan only looked at
    # a 100-character window and so truncated long URLs.  Empty htmlUrl
    # attributes are skipped.
    return re.findall(r'htmlUrl="([^"]+)"', opmlSource)
def getBlogroll(url):
    """Return the list of blogroll URLs found for the site at url.

    "http://" is prepended when url does not already start with it.  The
    site's OPML subscription list is preferred; when there is none, the
    page itself is fetched and scanned with BlogrollParser.  Returns an
    empty list when the page cannot be fetched.
    """
    if url[:7].lower() != 'http://':
        url = 'http://' + url
    radioBlogroll = getRadioBlogroll(url)
    if radioBlogroll:
        return radioBlogroll
    try:
        usock = MinimalURLOpener().open(url)
        try:
            htmlSource = usock.read()
        finally:
            # Close the connection even when read() fails.
            usock.close()
    except Exception:
        # Best effort: an unreachable page simply has no blogroll.
        return []
    parser = BlogrollParser(url)
    parser.feed(htmlSource)
    # close() makes the parser process any input it is still buffering.
    parser.close()
    return parser.blogroll
class BlogrollThread(Thread):
    """Worker thread that fetches one site's blogroll and reports it back."""

    def __init__(self, master, url):
        """master must provide callback(url, blogroll); url is the site to fetch."""
        Thread.__init__(self)
        self.master = master
        self.url = url

    def run(self):
        self.master.callback(self.url, getBlogroll(self.url))
class BlogrollThreadMaster:
    """Fetch the blogrolls of many sites using BlogrollThread workers.

    After construction, self.blogrollDict maps each fetched URL to the list
    of blogroll URLs found there.
    """

    def __init__(self, url, recurse):
        """Start crawling immediately.

        url     -- either a single URL string (its blogroll becomes the
                   starting list) or a ready-made list of URLs.
        recurse -- number of rounds: 1 fetches only the starting list, each
                   further round also fetches the sites found so far.
        """
        self.blogrollDict = {}
        self.done = 0
        if isinstance(url, str):
            blogroll = getBlogroll(url)
        else:
            blogroll = url
        self.run(blogroll, recurse)

    def callback(self, url, blogroll):
        """Record a worker's result; results arriving after the crawl has
        been marked done are ignored."""
        if not self.done:
            self.blogrollDict[url] = blogroll

    def run(self, blogroll, recurse):
        """Fetch every URL in blogroll, five at a time, then recurse."""
        # Drop repeated URLs (keeping first-seen order) so the same site is
        # not fetched by several workers in one round.
        seen = {}
        queue = []
        for url in blogroll:
            if url not in seen:
                seen[url] = 1
                queue.append(url)

        for start in range(0, len(queue), 5):
            threads = [BlogrollThread(self, url)
                       for url in queue[start:start + 5]
                       if url not in self.blogrollDict]
            for t in threads:
                t.start()
                time.sleep(0.000001)
            for t in threads:
                time.sleep(0.000001)
                # Wait at most 10 seconds per worker; stragglers are left
                # running and may still report through callback().
                t.join(10)

        if recurse > 1:
            # Flatten with extend().  reduce(operator.add, ...) raised
            # TypeError when nothing had been collected and was quadratic
            # otherwise.
            masterlist = []
            for sublist in list(self.blogrollDict.values()):
                masterlist.extend(sublist)
            newlist = [url for url in masterlist
                       if url not in self.blogrollDict]
            self.run(newlist, recurse - 1)
        else:
            self.done = 1
def sortBlogrollData(blogrollDict):
    """Count how many times each URL occurs across all blogrolls.

    blogrollDict maps a site URL to its list of blogroll URLs.  Returns a
    list of (count, url) tuples in descending order (by count, then by
    URL).
    """
    counts = {}
    for blogroll in blogrollDict.values():
        for url in blogroll:
            counts[url] = counts.get(url, 0) + 1
    ranked = [(count, url) for url, count in counts.items()]
    ranked.sort()
    ranked.reverse()
    return ranked
def trimdata(sortI, cutoff):
    """Return the (count, url) pairs of sortI whose count is at least cutoff,
    in their original order."""
    return [(c, url) for c, url in sortI if c >= cutoff]
def getRelated(url):
    """Return the result URLs of a "related:<url>" search.

    Uses the third-party `google` module, imported here so the rest of the
    file works without it.  Up to three result pages are requested (offsets
    0, 10 and 20); paging stops early when a page holds fewer than 10
    results.
    """
    import google
    results = []
    for start in (0, 10, 20):
        data = google.doGoogleSearch('related:%s' % url, start)
        results.extend([oneResult.URL for oneResult in data.results])
        if len(data.results) < 10:
            break
    return results
def getNeighborhood(baseURL):
    """Return the sites most often linked from the sites related to baseURL.

    The blogroll of every related site is fetched (one round, no
    recursion).  URLs that occur at least twice are returned as
    (count, url, display_name) tuples, highest count first.
    """
    relatedList = getRelated(baseURL)
    blogrollDict = BlogrollThreadMaster(relatedList, 1).blogrollDict
    neighborhood = sortBlogrollData(blogrollDict)
    neighborhood = trimdata(neighborhood, 2)
    return [(c, url, prettyURL(url)) for c, url in neighborhood]
def render_html(baseURL, data):
    """Render neighborhood data as an HTML table and return it as a string.

    baseURL -- the site whose neighborhood is shown (used in the caption).
    data    -- sequence of (count, url, title) tuples, as returned by
               getNeighborhood.
    """
    prettyBase = prettyURL(baseURL)
    output = []
    # quote=1 also escapes double quotes, which is required inside
    # attribute values.
    output.append("""
<table class="socialnetwork" summary="neighborhood for %s">
<caption>Neighborhood for %s</caption>
<thead>
<tr>
<th scope="col">Name</th>
<th scope="col">Links</th>
<th scope="col">Explore</th>
</tr>
</thead>
<tbody>""" % (cgi.escape(prettyBase, 1), cgi.escape(prettyBase)))
    for c, url, title in data:
        # The URLs were scraped from other sites, so treat them as untrusted:
        # URL-encode the value passed as a query parameter and HTML-escape
        # everything written into the markup.
        exploreURL = ('http://diveintomark.org/cgi-bin/neighborhood.cgi?url=%s'
                      % urllib.quote(url))
        output.append("""<tr><td><a href="%s">%s</a></td>
<td>%s</td><td><a href="%s">explore</a></td></tr>""" % (
            cgi.escape(url, 1), cgi.escape(title), c,
            cgi.escape(exploreURL, 1)))
    output.append("""
</tbody>
</table>""")
    return "".join(output)
def render_rss(baseURL, data):
    """Render neighborhood data as an RSS 1.0 (RDF) document string.

    baseURL -- the site whose neighborhood is described.
    data    -- sequence of (count, url, title) tuples, as returned by
               getNeighborhood.
    """
    title = prettyURL(baseURL)
    # Every interpolated value is XML-escaped; URLs containing "&" would
    # otherwise make the document ill-formed.
    channel = {
        'baseURL': cgi.escape(baseURL, 1),
        'channeltitle': cgi.escape("%s neighborhood" % title),
        'title': cgi.escape(title),
        # UTC timestamp.  The previous code formatted local time with a
        # hard-coded "-05:00" offset, which is only right in one time zone.
        'timestamp': time.strftime('%Y-%m-%dT%H:%M:%SZ', time.gmtime()),
    }
    items = [{'c': count,
              'url': cgi.escape(link, 1),
              'title': cgi.escape(label)}
             for count, link, label in data]
    output = []
    # Namespace and resource URIs are kept on one line each: a line break
    # inside an attribute value changes the URI.
    output.append("""<?xml version="1.0"?>
<rdf:RDF xmlns="http://purl.org/rss/1.0/"
 xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
 xmlns:dc="http://purl.org/dc/elements/1.1/"
 xmlns:sy="http://purl.org/rss/1.0/modules/syndication/"
 xmlns:admin="http://webns.net/mvcb/">
<channel rdf:about="%(baseURL)s">
<title>%(channeltitle)s</title>
<link>%(baseURL)s</link>
<description>Sites in the virtual neighborhood of %(title)s</description>
<language>en-us</language>
<lastBuildDate>%(timestamp)s</lastBuildDate>
<pubDate>%(timestamp)s</pubDate>
<admin:generatorAgent rdf:resource="http://diveintomark.org/cgi-bin/neighborhood.cgi/?v=1.1" />
<admin:errorReportsTo rdf:resource="mailto:f8dy@diveintomark.org"/>
<sy:updatePeriod>weekly</sy:updatePeriod>
<sy:updateFrequency>1</sy:updateFrequency>
<sy:updateBase>2000-01-01T12:00+00:00</sy:updateBase>
<items>
<rdf:Seq>
""" % channel)
    for item in items:
        output.append("""<rdf:li rdf:resource="%(url)s" />
""" % item)
    output.append("""</rdf:Seq>
</items>
</channel>
""")
    for item in items:
        output.append("""<item rdf:about="%(url)s">
<title>%(title)s</title>
<link>%(url)s</link>
<description>%(c)s links</description>
</item>
""" % item)
    output.append("""</rdf:RDF>""")
    return "".join(output)
if __name__ == '__main__':
    # Command-line use: print the HTML neighborhood table for the given URL.
    if len(sys.argv) < 2:
        sys.stderr.write('usage: %s URL\n' % sys.argv[0])
        sys.exit(1)
    # render_html needs the base URL as well as the data; the previous call
    # passed only the data.
    print(render_html(sys.argv[1], getNeighborhood(sys.argv[1])))