Revision 28793
Added by Dominika Tkaczyk almost 10 years ago
similarity.py | ||
---|---|---|
117 | 117 |
|
118 | 118 |
jaccard.registered=True |
119 | 119 |
|
120 |
|
|
121 |
def sorensendice(*args):
    """
    .. function:: sorensendice(jpack1,jpack2)

    Return the Sorensen-Dice similarity value of two jpacks, computed over
    the distinct elements of each pack as::

        2 * |A & B| / (|A| + |B|)

    If both jpacks contain no elements the result is 0.0.

    Example:

    >>> table1('''
    ... user1   movie1 20
    ... user1   movie2 30
    ... user2   movie1 40
    ... user2   movie3 90
    ... user2   movie4 90
    ... user3   movie1 40
    ... user3   movie3 80
    ... user4   movie1 70
    ... user4   movie2 10
    ... ''')

    NOTE that only column b is jgrouped because *sorensendice* operates on packs as sets, not weighted values, so packing
    also column c would not make any difference.

    >>> sql(\"""select u1.userid,u2.userid, sorensendice(u1.pk, u2.pk) as similarity
    ...     from
    ...         (select a as userid, jgroup(b)  as pk from table1 group by a) as u1,
    ...         (select a as userid, jgroup(b) as pk from table1 group by a) as u2
    ...     where u1.userid<u2.userid\""")
    userid | userid | similarity
    ----------------------------
    user1  | user2  | 0.4
    user1  | user3  | 0.5
    user1  | user4  | 1.0
    user2  | user3  | 0.8
    user2  | user4  | 0.4
    user3  | user4  | 0.5
    """

    if len(args) != 2:
        raise functions.OperatorError("sorensendice","operator takes exactly two arguments")
    try:
        r = jopts.fromj(args[0])
        s = jopts.fromj(args[1])
    except Exception as e:
        raise functions.OperatorError("sorensendice"," Wrong format arguments: %s" %(e))

    # Lists are unhashable, so list elements are turned into tuples before
    # being collected into sets.
    rset = set(tuple(x) if isinstance(x, list) else x for x in r)
    sset = set(tuple(x) if isinstance(x, list) else x for x in s)

    total = len(rset) + len(sset)
    if total == 0:
        # Both packs are empty: avoid ZeroDivisionError and report no similarity.
        return 0.0

    return 2 * float(len(rset & sset)) / total

# NOTE(review): presumably this flag is what exposes the function as an SQL
# operator; the code that reads it is not visible here -- confirm.
sorensendice.registered=True
|
172 |
|
|
120 | 173 |
#def euclean(*args):###not working with lists |
121 | 174 |
# """ |
122 | 175 |
# |
Also available in: Unified diff
Madis update