Project

General

Profile

« Previous | Next » 

Revision 28793

Added by Dominika Tkaczyk almost 10 years ago

Madis update

View differences:

jpacks.py
6 6
import itertools
7 7
import re
8 8
import functions
9
import math
9 10
try:
10 11
    from collections import OrderedDict
11 12
except ImportError:
12 13
    # Python 2.6
13 14
    from lib.collections26 import OrderedDict
14 15

  
15
def jpack(*args):
16
def jngrams(*args):
16 17

  
17 18
    """
18
    .. function:: jpack(args...) -> jpack
19
    .. function:: jngrams(n,text) -> jpack
19 20

  
20
    Converts multiple input arguments into a single string. Jpacks preserve the types
21
    of their inputs and are based on JSON encoding. Single values are represented as
22
    themselves where possible.
21
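    Splits the input text on spaces and returns a jpack of its word n-grams. If the first argument is not an integer, n defaults to 1 and the first argument is used as the text.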
23 22

  
24 23
    Examples:
25 24

  
26
    >>> sql("select jpack('a')")
27
    jpack('a')
28
    ----------
29
    a
25
    >>> sql("select jngrams(1,'This is a test phrase')")
26
    jngrams(1,'This is a test phrase')
27
    -------------------------------------------
28
    [["This"],["is"],["a"],["test"],["phrase"]]
30 29

  
31
    >>> sql("select jpack('a','b',3)")
32
    jpack('a','b',3)
33
    ----------------
34
    ["a","b",3]
30
    >>> sql("select jngrams(2,'This is a test phrase')")
31
    jngrams(2,'This is a test phrase')
32
    ---------------------------------------------------------
33
    [["This","is"],["is","a"],["a","test"],["test","phrase"]]
35 34

  
36
    >>> sql("select jpack('a', jpack('b',3))")
37
    jpack('a', jpack('b',3))
38
    ------------------------
39
    ["a",["b",3]]
40 35

  
41 36
    """
37
    if type(args[0]) == int:
38
        n = args[0]
39
        text = args[1]
40
    else:
41
        n = 1
42
        text = args[0]
43
    g = text.split(' ')
44
    listofngrams = []
45
    for i in xrange(len(g)-n+1):
46
        listofngrams.append(g[i:i+n])
47
    return jopts.toj(listofngrams)
42 48

  
43
    return jopts.toj(jopts.elemfromj(*args))
44 49

  
45
jpack.registered=True
50
jngrams.registered=True
46 51

  
52

  
53

  
54
def jfrequentwords(*args):
55

  
56
    """
57
    .. function:: jfrequentwords(args...) -> jpack
58

  
59
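    Returns, as a jpack, the words of a space-separated text whose number of occurrences is at least the (rounded up) average word frequency, computed after trimming roughly 3% of the lowest and highest frequencies.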
60

  
61
    """
62
    wordslist = args[0].split(' ')
63
    setwords = set(wordslist)
64
    c = dict.fromkeys(setwords, 0)
65
    for w in wordslist:
66
        c[w]+=1
67
    lenwords = len(setwords)
68
    extremevals = int(math.ceil(lenwords * 3 * 1.0/100))
69
    frequences = sorted(c.values())[extremevals:(lenwords-extremevals)]
70
    avgfrequency = math.ceil(sum(frequences)*1.0/len(frequences))
71

  
72
    return jopts.toj([k for k,v in c.iteritems() if v >= avgfrequency])
73

  
74
jfrequentwords.registered=True
75

  
47 76
def jsonstrict(*args):
48 77

  
49 78
    """
......
80 109
    """
81 110
    .. function:: jzip(args...) -> json string
82 111

  
83
    It combines the correspinding elements of input jpacks.
112
    It combines the corresponding elements of input jpacks.
84 113

  
85 114
    Examples:
86 115

  
......
94 123

  
95 124
jzip.registered=True
96 125

  
126
def jzipdict(*args):
127

  
128
    """
129
    .. function:: jzipdict(args...) -> json string
130

  
131
    It combines the corresponding elements of input jpacks into a jdict.
132

  
133
    Examples:
134

  
135
    >>> sql('''select jzipdict('["a", "b"]', '[1,2]','[4,5]')''')
136
    jzipdict('["a", "b"]', '[1,2]','[4,5]')
137
    ---------------------------------------
138
    {"a":[1,4],"b":[2,5]}
139

  
140
    """
141
    return json.dumps(dict(tuple([x[0], x[1:]]) for x in zip(*jopts.elemfromj(*args))), separators=(',',':'), ensure_ascii=False)
142

  
143
jzipdict.registered=True
144

  
97 145
def jlen(*args):
98 146

  
99 147
    """
......
359 407

  
360 408
jmerge.registered=True
361 409

  
410

  
362 411
def jset(*args):
363 412
    """
364 413
    .. function:: jset(jpacks) -> jpack
......
374 423

  
375 424
    """
376 425

  
377
    return jopts.toj(sorted(set( jopts.fromj(*args) )))
426
    return jopts.toj(sorted(set(jopts.fromj(*args))))
378 427

  
379
jset.registered=True
428
jset.registered = True
380 429

  
430

  
381 431
def jexcept(*args):
382 432
    """
383 433
    .. function:: jexcept(jpackA, jpackB) -> jpack
......
404 454
    b = set(jopts.fromj(args[1]))
405 455
    return jopts.toj([x for x in jopts.fromj(args[0]) if x not in b])
406 456

  
407
jexcept.registered=True
457
jexcept.registered = True
408 458

  
409 459

  
460
def jintersection(*args):
461
    """
462
    .. function:: jintersection(jpackA, jpackB, ...) -> jpack
463

  
464
    Returns the sorted items that appear in all of the input jpacks.
465

  
466
    Examples:
467

  
468
    >>> sql("select jintersection('[1,2,3]', '[1,2,3]')") # doctest: +NORMALIZE_WHITESPACE
469
    jintersection('[1,2,3]', '[1,2,3]')
470
    -----------------------------------
471
    [1,2,3]
472

  
473
    >>> sql("select jintersection('[1,2,3]', '[1,3]', 1)") # doctest: +NORMALIZE_WHITESPACE
474
    jintersection('[1,2,3]', '[1,3]', 1)
475
    ------------------------------------
476
    1
477

  
478
    """
479

  
480
    if len(args) < 2:
481
        raise functions.OperatorError("jintersection","operator needs at least two inputs")
482

  
483
    return jopts.toj(sorted(set.intersection(*[set(jopts.fromj(x)) for x in args])))
484

  
485
jintersection.registered = True
486

  
487

  
410 488
def jsort(*args):
411 489

  
412 490
    """
......
746 824
def jdictsplit(*args):
747 825

  
748 826
    """
749
    .. function:: jdictvals(jdict, [key1, key2,..]) -> columns
827
    .. function:: jdictsplit(jdict, [key1, key2,..]) -> columns
750 828

  
751 829
    If only the first argument (jdict) is provided, it returns a row containing the values of input jdict (sorted by the jdict keys).
752 830

  
......
785 863

  
786 864
jdictsplit.registered=True
787 865

  
866

  
867
def jdictsplitv(*args):
868

  
869
    """
870
    .. function:: jdictsplitv(jdict, [key1, key2,..]) -> columns
871

  
872
    If only the first argument (jdict) is provided, it returns one (key, val) row for each entry of the input jdict.
873

  
874
    If keys are also provided, it returns only the rows of the provided keys, in the order given; keys that are missing from the jdict are skipped.
875

  
876
    Examples:
877

  
878
    >>> sql(''' select jdictsplitv('{"k1":1,"k2":2}') ''') # doctest: +NORMALIZE_WHITESPACE
879
    key | val
880
    ---------
881
    k1  | 1
882
    k2  | 2
883

  
884
    >>> sql(''' select jdictsplitv('{"k1":1,"k2":2, "k3":3}', 'k3', 'k1', 'k4') ''') # doctest: +NORMALIZE_WHITESPACE
885
    key | val
886
    ---------
887
    k3  | 3
888
    k1  | 1
889

  
890
    """
891

  
892
    yield ('key', 'val')
893
    if len(args) == 1:
894
        dlist = json.loads(args[0], object_pairs_hook=OrderedDict)
895
        for k, v in dlist.iteritems():
896
            yield [k, jopts.toj(v)]
897
    else:
898
        dlist = json.loads(args[0])
899
        for k in args[1:]:
900
            try:
901
                yield k, jopts.toj(dlist[k])
902
            except KeyError:
903
                pass
904

  
905
jdictsplitv.registered = True
906

  
788 907
def jdictgroupkey(*args):
789 908
    """
790 909
    .. function:: jdictgroupkey(list_of_jdicts, groupkey1, groupkey2, ...)

Also available in: Unified diff