Cover | Table of Contents | Colophon
# A dictionary of movie critics and their ratings of a small
# set of movies
critics={'Lisa Rose': {'Lady in the Water': 2.5, 'Snakes on a Plane': 3.5,
'Just My Luck': 3.0, 'Superman Returns': 3.5, 'You, Me and Dupree': 2.5,
'The Night Listener': 3.0},
'Gene Seymour': {'Lady in the Water': 3.0, 'Snakes on a Plane': 3.5,
'Just My Luck': 1.5, 'Superman Returns': 5.0, 'The Night Listener': 3.0,
'You, Me and Dupree': 3.5},
'Michael Phillips': {'Lady in the Water': 2.5, 'Snakes on a Plane': 3.0,
'Superman Returns': 3.5, 'The Night Listener': 4.0},
'Claudia Puig': {'Snakes on a Plane': 3.5, 'Just My Luck': 3.0,
'The Night Listener': 4.5, 'Superman Returns': 4.0,
'You, Me and Dupree': 2.5},
'Mick LaSalle': {'Lady in the Water': 3.0, 'Snakes on a Plane': 4.0,
'Just My Luck': 2.0, 'Superman Returns': 3.0, 'The Night Listener': 3.0,
'You, Me and Dupree': 2.0},
'Jack Matthews': {'Lady in the Water': 3.0, 'Snakes on a Plane': 4.0,
'The Night Listener': 3.0, 'Superman Returns': 5.0, 'You, Me and Dupree': 3.5},
'Toby': {'Snakes on a Plane':4.5,'You, Me and Dupree':1.0,'Superman Returns':4.0}}
pow(n,2) function to square a
number and take the square root with the sqrt function:>>from math import sqrt
>>sqrt(pow(4.5-4,2)+pow(1-2,2))
1.1180339887498949>>1/(1+sqrt(pow(4.5-4,2)+pow(1-2,2))) 0.47213595499957939
topMatches.Critic | Similarity | Night | S.xNight | Lady | S.xLady | Luck | S.xLuck |
|---|---|---|---|---|---|---|---|
Rose | 0.99 | 3.0 | 2.97 | 2.5 | 2.48 | 3.0 | 2.97 |
Seymour | 0.38 | 3.0 | 1.14 | 3.0 | 1.14 | 1.5 | 0.57 |
Puig | 0.89 | 4.5 | 4.02 | 3.0 | 2.68 | ||
LaSalle | 0.92 | 3.0 | 2.77 | 3.0 | 2.77 | 2.0 | 1.85 |
Matthews | 0.66 | 3.0 | 1.99 | 3.0 | 1.99 | ||
Total | 12.89 | 8.38 | 8.07 | ||||
Sim.Sum | 3.84 | 2.95 | 3.18 | ||||
Total/Sim.
Sum | 3.35 | 2.83 | 2.53 |
{'Lisa Rose': {'Lady in the Water': 2.5, 'Snakes on a Plane': 3.5},
'Gene Seymour': {'Lady in the Water': 3.0, 'Snakes on a Plane': 3.5}}{'Lady in the Water':{'Lisa Rose':2.5,'Gene Seymour':3.0},
'Snakes on a Plane':{'Lisa Rose':3.5,'Gene Seymour':3.5}} etc..def transformPrefs(prefs):
result={}
for person in prefs:
for item in prefs[person]:
result.setdefault(item,{})
# Flip item and person
result[item][person]=prefs[person][item]
return resulttopMatches
function used earlier to find the set of movies most similar to
Superman Returns:>>reload(recommendations) >> movies=recommendations.transformPrefs(recommendations.critics) >> recommendations.topMatches(movies,'Superman Returns') [(0.657, 'You, Me and Dupree'), (0.487, 'Lady in the Water'), (0.111, 'Snakes on a Plane'), (−0.179, 'The Night Listener'), (−0.422, 'Just My Luck')]
get_popular call:>>import pydelicious >> pydelicious.get_popular(tag='programming') [{'count': '', 'extended': '', 'hash': '', 'description': u'How To Write Unmaintainable Code', 'tags': '', 'href': u'http://thc.segfault.net/root/phun/ unmaintain.html', 'user': u'dorsia', 'dt': u'2006-08-19T09:48:56Z'}, {'count': '', 'extended': '', 'hash': '', 'description': u'Threading in C#', 'tags': '', 'href': u'http://www.albahari.com/threading/', 'user': u'mmihale', 'dt': u'2006-05-17T18: 09:24Z'},
def calculateSimilarItems(prefs,n=10):
# Create a dictionary of items showing which other items they
# are most similar to.
result={}
# Invert the preference matrix to be item-centric
itemPrefs=transformPrefs(prefs)
c=0
for item in itemPrefs:
# Status updates for large datasets
c+=1
if c%100==0: print "%d / %d" % (c,len(itemPrefs))
# Find the most similar items to this one
scores=topMatches(itemPrefs,item,n=n,similarity=sim_distance)
result[item]=scores
return result196 242 3 881250949 186 302 3 891717742 22 377 1 878887116 244 51 2 880606923 166 346 1 886397596 298 474 4 884182806
loadMovieLens in recommendations.py to load this
dataset:def loadMovieLens(path='/data/movielens'):
# Get movie titles
movies={}
for line in open(path+'/u.item'):
(id,title)=line.split('|')[0:2]
movies[id]=title
# Load data
prefs={}
for line in open(path+'/u.data'):
(user,movieid,rating,ts)=line.split('\t')
prefs.setdefault(user,{})
prefs[user][movies[movieid]]=float(rating)
return prefs>>>reload(recommendations) >>> prefs=recommendations.loadMovieLens( ) >>> prefs['87'] {'Birdcage, The (1996)': 4.0, 'E.T. the Extra-Terrestrial (1982)': 3.0, 'Bananas (1971)': 5.0, 'Sting, The (1973)': 5.0, 'Bad Boys (1995)': 4.0, 'In the Line of Fire (1993)': 5.0, 'Star Trek: The Wrath of Khan (1982)': 5.0, 'Speechless (1994)': 4.0, etc...
>>>recommendations.getRecommendations(prefs,'87')[0:30]"china" | "kids" | "music" | "yahoo" | |
|---|---|---|---|---|
Gothamist | 0 | 3 | 3 | 0 |
GigaOM | 6 | 0 | 0 | 2 |
Quick Online
Tips | 0 | 2 | 2 | 22 |
import statement to the beginning of clusters.py:from PIL import Image,ImageDraw
def getheight(clust): # Is this an endpoint? Then the height is just 1 if clust.left==None and clust.right==None: return 1 # Otherwise the height is the same of the heights of # each branch return getheight(clust.left)+getheight(clust.right)
def getdepth(clust): # The distance of an endpoint is 0.0 if clust.left==None and clust.right==None: return 0 # The distance of a branch is the greater of its two sides # plus its own distance return max(getdepth(clust.left),getdepth(clust.right))+clust.distance
def rotatematrix(data):
newdata=[]
for i in range(len(data[0])):
newrow=[data[j][i] for j in range(len(data))]
newdata.append(newrow)
return newdata>>reload(clusters) >> rdata=clusters.rotatematrix(data) >> wordclust=clusters.hcluster(rdata) >> clusters.drawdendrogram(wordclust,labels=words,jpeg='wordclust.jpg')