Revision 28793
Added by Dominika Tkaczyk almost 10 years ago
jpacks.py | ||
---|---|---|
6 | 6 |
import itertools |
7 | 7 |
import re |
8 | 8 |
import functions |
9 |
import math |
|
9 | 10 |
try: |
10 | 11 |
from collections import OrderedDict |
11 | 12 |
except ImportError: |
12 | 13 |
# Python 2.6 |
13 | 14 |
from lib.collections26 import OrderedDict |
14 | 15 |
|
15 |
def jpack(*args):
|
|
16 |
def jngrams(*args):
|
|
16 | 17 |
|
17 | 18 |
""" |
18 |
.. function:: jpack(args...) -> jpack
|
|
19 |
.. function:: jngrams(n,text) -> jpack
|
|
19 | 20 |
|
20 |
Converts multiple input arguments into a single string. Jpacks preserve the types |
|
21 |
of their inputs and are based on JSON encoding. Single values are represented as |
|
22 |
themselves where possible. |
|
21 |
Converts multiple input arguments into a jpack of ngrams. |
|
23 | 22 |
|
24 | 23 |
Examples: |
25 | 24 |
|
26 |
>>> sql("select jpack('a')")
|
|
27 |
jpack('a')
|
|
28 |
---------- |
|
29 |
a
|
|
25 |
>>> sql("select jngrams(1,'This is a test phrase')")
|
|
26 |
jngrams(1,'This is a test phrase')
|
|
27 |
-------------------------------------------
|
|
28 |
[["This"],["is"],["a"],["test"],["phrase"]]
|
|
30 | 29 |
|
31 |
>>> sql("select jpack('a','b',3)")
|
|
32 |
jpack('a','b',3)
|
|
33 |
---------------- |
|
34 |
["a","b",3]
|
|
30 |
>>> sql("select jngrams(2,'This is a test phrase')")
|
|
31 |
jngrams(2,'This is a test phrase')
|
|
32 |
---------------------------------------------------------
|
|
33 |
[["This","is"],["is","a"],["a","test"],["test","phrase"]]
|
|
35 | 34 |
|
36 |
>>> sql("select jpack('a', jpack('b',3))") |
|
37 |
jpack('a', jpack('b',3)) |
|
38 |
------------------------ |
|
39 |
["a",["b",3]] |
|
40 | 35 |
|
41 | 36 |
""" |
37 |
if type(args[0]) == int: |
|
38 |
n = args[0] |
|
39 |
text = args[1] |
|
40 |
else: |
|
41 |
n = 1 |
|
42 |
text = args[0] |
|
43 |
g = text.split(' ') |
|
44 |
listofngrams = [] |
|
45 |
for i in xrange(len(g)-n+1): |
|
46 |
listofngrams.append(g[i:i+n]) |
|
47 |
return jopts.toj(listofngrams) |
|
42 | 48 |
|
43 |
return jopts.toj(jopts.elemfromj(*args)) |
|
44 | 49 |
|
45 |
jpack.registered=True
|
|
50 |
jngrams.registered=True
|
|
46 | 51 |
|
52 |
|
|
53 |
|
|
54 |
def jfrequentwords(*args):

    """
    .. function:: jfrequentwords(text) -> jpack

    Returns the frequent words of a text in a jpack.

    The text (first argument) is split on single space characters and the
    occurrences of every distinct word are counted. The lowest and highest
    3% of the frequencies (rounded up, counted over the distinct words) are
    discarded, and the average of the remaining frequencies, rounded up, is
    used as a threshold. Every word that occurs at least that many times is
    returned.

    When the text contains only one or two distinct words the trimming would
    discard every frequency, so the threshold is computed over all of them
    instead.

    The order of the words in the result follows dictionary iteration order.

    """
    words = args[0].split(' ')

    counts = dict.fromkeys(set(words), 0)
    for word in words:
        counts[word] += 1

    distinct = len(counts)
    # Number of extreme values dropped from each end of the sorted
    # frequencies: 3% of the distinct words, rounded up.
    trim = int(math.ceil(distinct * 3 / 100.0))

    ordered = sorted(counts.values())
    # With one or two distinct words the slice is empty; averaging it would
    # divide by zero, so fall back to the untrimmed frequencies.
    trimmed = ordered[trim:distinct - trim] or ordered
    threshold = math.ceil(sum(trimmed) / float(len(trimmed)))

    # items() works under both Python 2 and Python 3.
    return jopts.toj([word for word, count in counts.items() if count >= threshold])

jfrequentwords.registered = True
|
75 |
|
|
47 | 76 |
def jsonstrict(*args): |
48 | 77 |
|
49 | 78 |
""" |
... | ... | |
80 | 109 |
""" |
81 | 110 |
.. function:: jzip(args...) -> json string |
82 | 111 |
|
83 |
It combines the correspinding elements of input jpacks.
|
|
112 |
It combines the corresponding elements of input jpacks.
|
|
84 | 113 |
|
85 | 114 |
Examples: |
86 | 115 |
|
... | ... | |
94 | 123 |
|
95 | 124 |
jzip.registered=True |
96 | 125 |
|
126 |
def jzipdict(*args):

    """
    .. function:: jzipdict(args...) -> json string

    It combines the corresponding elements of input jpacks into a jdict.
    The elements of the first jpack are used as keys; each key is mapped to
    the elements found at the same position in the remaining jpacks.

    Examples:

    >>> sql('''select jzipdict('["a", "b"]', '[1,2]','[4,5]')''')
    jzipdict('["a", "b"]', '[1,2]','[4,5]')
    ---------------------------------------
    {"a":[1,4],"b":[2,5]}

    """
    zipped = {}
    # Each row holds the elements at one position across all input jpacks:
    # the first one is the key, the rest form its value.
    for row in zip(*jopts.elemfromj(*args)):
        zipped[row[0]] = row[1:]

    return json.dumps(zipped, separators=(',', ':'), ensure_ascii=False)

jzipdict.registered=True
|
144 |
|
|
97 | 145 |
def jlen(*args): |
98 | 146 |
|
99 | 147 |
""" |
... | ... | |
359 | 407 |
|
360 | 408 |
jmerge.registered=True |
361 | 409 |
|
410 |
|
|
362 | 411 |
def jset(*args): |
363 | 412 |
""" |
364 | 413 |
.. function:: jset(jpacks) -> jpack |
... | ... | |
374 | 423 |
|
375 | 424 |
""" |
376 | 425 |
|
377 |
return jopts.toj(sorted(set( jopts.fromj(*args) )))
|
|
426 |
return jopts.toj(sorted(set(jopts.fromj(*args))))
|
|
378 | 427 |
|
379 |
jset.registered=True
|
|
428 |
jset.registered = True
|
|
380 | 429 |
|
430 |
|
|
381 | 431 |
def jexcept(*args): |
382 | 432 |
""" |
383 | 433 |
.. function:: jexcept(jpackA, jpackB) -> jpack |
... | ... | |
404 | 454 |
b = set(jopts.fromj(args[1])) |
405 | 455 |
return jopts.toj([x for x in jopts.fromj(args[0]) if x not in b]) |
406 | 456 |
|
407 |
jexcept.registered=True
|
|
457 |
jexcept.registered = True
|
|
408 | 458 |
|
409 | 459 |
|
460 |
def jintersection(*args):
    """
    .. function:: jintersection(jpackA, jpackB, ...) -> jpack

    Returns the items that appear in every one of the input jpacks.
    Duplicate items are removed and the result is sorted.

    At least two inputs are required, otherwise an OperatorError is raised.

    Examples:

    >>> sql("select jintersection('[1,2,3]', '[1,2,3]')") # doctest: +NORMALIZE_WHITESPACE
    jintersection('[1,2,3]', '[1,2,3]')
    -----------------------------------
    [1,2,3]

    >>> sql("select jintersection('[1,2,3]', '[1,3]', 1)") # doctest: +NORMALIZE_WHITESPACE
    jintersection('[1,2,3]', '[1,3]', 1)
    ------------------------------------
    1

    """

    if len(args) < 2:
        raise functions.OperatorError("jintersection","operator needs at least two inputs")

    # One set per input, then keep only the items shared by all of them.
    itemsets = [set(jopts.fromj(x)) for x in args]
    common = set.intersection(*itemsets)

    return jopts.toj(sorted(common))

jintersection.registered = True
|
486 |
|
|
487 |
|
|
410 | 488 |
def jsort(*args): |
411 | 489 |
|
412 | 490 |
""" |
... | ... | |
746 | 824 |
def jdictsplit(*args): |
747 | 825 |
|
748 | 826 |
""" |
749 |
.. function:: jdictvals(jdict, [key1, key2,..]) -> columns
|
|
827 |
.. function:: jdictsplit(jdict, [key1, key2,..]) -> columns
|
|
750 | 828 |
|
751 | 829 |
If only the first argument (jdict) is provided, it returns a row containing the values of input jdict (sorted by the jdict keys). |
752 | 830 |
|
... | ... | |
785 | 863 |
|
786 | 864 |
jdictsplit.registered=True |
787 | 865 |
|
866 |
|
|
867 |
def jdictsplitv(*args):

    """
    .. function:: jdictsplitv(jdict, [key1, key2,..]) -> rows

    If only the first argument (jdict) is provided, it returns one (key, val)
    row for every entry of the input jdict, in the order the keys appear in
    the jdict.

    If key values are also provided, it returns only the rows of the keys
    that have been provided, in the order they were given. Keys that do not
    exist in the jdict are skipped.

    Examples:

    >>> sql(''' select jdictsplitv('{"k1":1,"k2":2}') ''') # doctest: +NORMALIZE_WHITESPACE
    key | val
    ---------
    k1  | 1
    k2  | 2

    >>> sql(''' select jdictsplitv('{"k1":1,"k2":2, "k3":3}', 'k3', 'k1', 'k4') ''') # doctest: +NORMALIZE_WHITESPACE
    key | val
    ---------
    k3  | 3
    k1  | 1

    """

    # The first yielded row carries the column names.
    yield ('key', 'val')

    if len(args) == 1:
        # OrderedDict keeps the entries in the order they appear in the JSON text.
        entries = json.loads(args[0], object_pairs_hook=OrderedDict)
        # items() works under both Python 2 and Python 3.
        for key, val in entries.items():
            yield (key, jopts.toj(val))
        return

    jdict = json.loads(args[0])
    for key in args[1:]:
        # Only the lookup is guarded, so a KeyError raised while encoding the
        # value or by the consumer of this generator is not silently dropped.
        try:
            val = jdict[key]
        except KeyError:
            continue
        yield (key, jopts.toj(val))

jdictsplitv.registered = True
|
906 |
|
|
788 | 907 |
def jdictgroupkey(*args): |
789 | 908 |
""" |
790 | 909 |
.. function:: jdictgroupkey(list_of_jdicts, groupkey1, groupkey2, ...) |
Also available in: Unified diff
Madis update