Project

General

Profile

« Previous | Next » 

Revision 28793

Added by Dominika Tkaczyk almost 10 years ago

Madis update

View differences:

similarity.py
117 117

  
118 118
jaccard.registered=True
119 119

  
120

  
121
def sorensendice(*args):
    """
    .. function:: sorensendice(jpack1,jpack2)

    Return Sorensen-Dice similarity value of two jpacks.

    The value is ``2 * |A & B| / (|A| + |B|)``, where A and B are the sets of
    distinct elements of the two jpacks. When both jpacks are empty the
    similarity is defined as 0.0 instead of raising a division error.

    Raises an OperatorError when the number of arguments is not exactly two,
    or when an argument cannot be decoded as a jpack.

    Example:

    >>> table1('''
    ... user1   movie1 20
    ... user1   movie2 30
    ... user2   movie1 40
    ... user2   movie3 90
    ... user2   movie4 90
    ... user3   movie1 40
    ... user3   movie3 80
    ... user4   movie1 70
    ... user4   movie2 10
    ... ''')

    NOTE that only column b is jgrouped because *sorensendice* operates on packs as sets, not weighted values, so packing
    column c as well would not make any difference.

    >>> sql(\"""select u1.userid,u2.userid, sorensendice(u1.pk, u2.pk) as similarity
    ...     from
    ...         (select a as userid, jgroup(b)  as pk from table1 group by a) as u1,
    ...         (select a as userid, jgroup(b) as pk from table1 group by a) as u2
    ...     where u1.userid<u2.userid\""")
    userid | userid | similarity
    ----------------------------
    user1  | user2  | 0.4
    user1  | user3  | 0.5
    user1  | user4  | 1.0
    user2  | user3  | 0.8
    user2  | user4  | 0.4
    user3  | user4  | 0.5
    """

    if len(args) != 2:
        raise functions.OperatorError("sorensendice","operator takes exactly two arguments")
    try:
        r = jopts.fromj(args[0])
        s = jopts.fromj(args[1])
    except Exception as e:
        raise functions.OperatorError("sorensendice"," Wrong format arguments: %s" %(e))

    # Lists are unhashable, so list elements are turned into tuples before
    # being placed in a set.
    rset = set(tuple(x) if isinstance(x, list) else x for x in r)
    sset = set(tuple(x) if isinstance(x, list) else x for x in s)

    total = len(rset) + len(sset)
    if total == 0:
        # Both packs are empty: there is nothing in common and the formula
        # would divide by zero.
        return 0.0

    return 2 * float(len(rset & sset)) / total


# NOTE(review): presumably this flag tells the framework to expose the function
# as an SQL operator -- confirm against the module loader.
sorensendice.registered=True
172

  
120 173
#def euclean(*args):###not working with lists
121 174
#    """
122 175
#

Also available in: Unified diff