/modules/uoa-iis-3rdparty-madis/trunk/src/main/resources/eu/dnetlib/iis/3rdparty/scripts/madis/functions/row/jpacks.py - Diff - D-Net - D-Net project tracking tool

« Previous | Next »

Revision 28793

Added by Dominika Tkaczyk almost 10 years ago

Madis update

     import itertools
     import re
     import functions
     import math
     try:
         from collections import OrderedDict
     except ImportError:
         # Python 2.6
         from lib.collections26 import OrderedDict
     def jpack(*args):
     def jngrams(*args):
         """
         .. function:: jpack(args...) -> jpack
         .. function:: jngrams(n,text) -> jpack
         Converts multiple input arguments into a single string. Jpacks preserve the types
         of their inputs and are based on JSON encoding. Single values are represented as
         themselves where possible.
         Converts multiple input arguments into a jpack of ngrams.
         Examples:
         >>> sql("select jpack('a')")
         jpack('a')
         ----------
+        a
         >>> sql("select jngrams(1,'This is a test phrase')")
         jngrams(1,'This is a test phrase')
         -------------------------------------------
         [["This"],["is"],["a"],["test"],["phrase"]]
         >>> sql("select jpack('a','b',3)")
         jpack('a','b',3)
         ----------------
         ["a","b",3]
         >>> sql("select jngrams(2,'This is a test phrase')")
         jngrams(2,'This is a test phrase')
         ---------------------------------------------------------
         [["This","is"],["is","a"],["a","test"],["test","phrase"]]
         >>> sql("select jpack('a', jpack('b',3))")
         jpack('a', jpack('b',3))
         ------------------------
         ["a",["b",3]]
         """
         if type(args[0]) == int:
             n = args[0]
             text = args[1]
         else:
             n = 1
             text = args[0]
         g = text.split(' ')
         listofngrams = []
         for i in xrange(len(g)-n+1):
             listofngrams.append(g[i:i+n])
         return jopts.toj(listofngrams)
         return jopts.toj(jopts.elemfromj(*args))
     jpack.registered=True
     jngrams.registered=True
     def jfrequentwords(*args):
         """
         .. function:: jfrequentwords(args...) -> jpack

         Returns the frequent words of a text in a jpack.

         The text (first argument) is split on single spaces and the occurrences
         of every distinct token are counted. The sorted counts are trimmed at
         both ends by ceil(3%) of the number of distinct tokens, and the
         rounded-up mean of the remaining counts becomes the threshold: every
         token that occurs at least that many times is returned.

         When the trimming leaves no counts at all (texts with only one or two
         distinct tokens, including the empty string), the untrimmed counts are
         averaged instead, so the function no longer fails with a division by
         zero on such inputs.
         """
         wordslist = args[0].split(' ')
         setwords = set(wordslist)

         # Occurrence counter, seeded with every distinct token.
         c = dict.fromkeys(setwords, 0)
         for w in wordslist:
             c[w]+=1

         lenwords = len(setwords)
         # Number of extreme counts dropped from each end of the sorted list.
         extremevals = int(math.ceil(lenwords * 3 * 1.0/100))
         allfrequences = sorted(c.values())
         frequences = allfrequences[extremevals:(lenwords-extremevals)]
         if not frequences:
             # Trimming removed everything (one or two distinct tokens):
             # fall back to the full list, which always has at least one entry
             # because str.split never returns an empty list.
             frequences = allfrequences

         avgfrequency = math.ceil(sum(frequences)*1.0/len(frequences))
         return jopts.toj([k for k,v in c.iteritems() if v >= avgfrequency])
     jfrequentwords.registered=True
     def jsonstrict(*args):
         """
-...
         """
         .. function:: jzip(args...) -> json string
         It combines the correspinding elements of input jpacks.
         It combines the corresponding elements of input jpacks.
         Examples:
-...
     jzip.registered=True
     def jzipdict(*args):
         """
         .. function:: jzipdict(args...) -> json string

         It combines the corresponding elements of input jpacks into a jdict.
         Each element of the first input is used as a key, and the elements at
         the same position in the remaining inputs form that key's value.

         Examples:

         >>> sql('''select jzipdict('["a", "b"]', '[1,2]','[4,5]')''')
         jzipdict('["a", "b"]', '[1,2]','[4,5]')
         ---------------------------------------
         {"a":[1,4],"b":[2,5]}
         """
         packs = jopts.elemfromj(*args)
         zipped = {}
         # Walk the inputs in lockstep: first element is the key, the rest the value.
         for row in zip(*packs):
             zipped[row[0]] = row[1:]
         return json.dumps(zipped, separators=(',',':'), ensure_ascii=False)
     jzipdict.registered=True
     def jlen(*args):
         """
-...
     jmerge.registered=True
     def jset(*args):
         """
         .. function:: jset(jpacks) -> jpack
-...
         """
         return jopts.toj(sorted(set( jopts.fromj(*args) )))
         return jopts.toj(sorted(set(jopts.fromj(*args))))
     jset.registered=True
     jset.registered = True
     def jexcept(*args):
         """
         .. function:: jexcept(jpackA, jpackB) -> jpack
-...
         b = set(jopts.fromj(args[1]))
         return jopts.toj([x for x in jopts.fromj(args[0]) if x not in b])
     jexcept.registered=True
     jexcept.registered = True
     def jintersection(*args):
         """
         .. function:: jintersection(jpackA, jpackB) -> jpack

         Returns the items that appear in every one of the input jpacks, sorted.
         Any number of jpacks may be given, but at least two are required;
         otherwise an OperatorError is raised.

         Examples:

         >>> sql("select jintersection('[1,2,3]', '[1,2,3]')") # doctest: +NORMALIZE_WHITESPACE
         jintersection('[1,2,3]', '[1,2,3]')
         -----------------------------------
         [1,2,3]
         >>> sql("select jintersection('[1,2,3]', '[1,3]', 1)") # doctest: +NORMALIZE_WHITESPACE
         jintersection('[1,2,3]', '[1,3]', 1)
         ------------------------------------
         """
         if len(args) < 2:
             raise functions.OperatorError("jintersection","operator needs at least two inputs")

         # One set per input jpack; the result keeps only the shared members.
         itemsets = [set(jopts.fromj(pack)) for pack in args]
         common = itemsets[0].intersection(*itemsets[1:])
         return jopts.toj(sorted(common))
     jintersection.registered = True
     def jsort(*args):
         """
-...
     def jdictsplit(*args):
         """
         .. function:: jdictvals(jdict, [key1, key2,..]) -> columns
         .. function:: jdictsplit(jdict, [key1, key2,..]) -> columns
         If only the first argument (jdict) is provided, it returns a row containing the values of input jdict (sorted by the jdict keys).
-...
     jdictsplit.registered=True
     def jdictsplitv(*args):
         """
         .. function:: jdictsplitv(jdict, [key1, key2,..]) -> columns
         If only the first argument (jdict) is provided, it returns rows containing the values of input jdict.
         If key values are also provided, it returns only the columns of which the keys have been provided.
         Examples:
         >>> sql(''' select jdictsplitv('{"k1":1,"k2":2}') ''') # doctest: +NORMALIZE_WHITESPACE
         key | val
         ---------
         k1  | 1
         k2  | 2
         >>> sql(''' select jdictsplitv('{"k1":1,"k2":2, "k3":3}', 'k3', 'k1', 'k4') ''') # doctest: +NORMALIZE_WHITESPACE
         key | val
         ---------
         k3  | 3
         k1  | 1
         """
         yield ('key', 'val')
         if len(args) == 1:
             dlist = json.loads(args[0], object_pairs_hook=OrderedDict)
             for k, v in dlist.iteritems():
                 yield [k, jopts.toj(v)]
         else:
             dlist = json.loads(args[0])
             for k in args[1:]:
                 try:
                     yield k, jopts.toj(dlist[k])
                 except KeyError:
                     pass
     jdictsplitv.registered = True
     def jdictgroupkey(*args):
         """
         .. function:: jdictgroupkey(list_of_jdicts, groupkey1, groupkey2, ...)

Also available in: Unified diff

Project

General

Profile

D-Net

Revision 28793

Added by Dominika Tkaczyk almost 10 years ago