1
|
# coding: utf-8
|
2
|
import setpath
|
3
|
import re
|
4
|
import functions
|
5
|
import unicodedata
|
6
|
import hashlib
|
7
|
import zlib
|
8
|
import itertools
|
9
|
from collections import deque
|
10
|
from lib import jopts
|
11
|
|
12
|
# Increase regular expression cache.
# NOTE: ``_MAXCACHE`` is a private attribute of the ``re`` module, so the
# assignment is best-effort: any ordinary error is ignored, while
# KeyboardInterrupt/SystemExit are no longer swallowed.
try:
    re._MAXCACHE = 1000
except Exception:
    pass
|
17
|
|
18
|
# Every regular expression containing \W \w \D \d \b \S \s needs to be
# compiled with re.UNICODE, like the patterns below. To embed the UNICODE
# directive inside the regular expression itself use the (?u) prefix, e.g.
#   re.sub(ur'(?u)[\W\d]', ' ', o)

# Any single non-word character.
# NOTE(review): despite its name this pattern is identical to
# delete_non_letters and therefore leaves digits in place -- confirm intent.
delete_numbers_and_non_letters = re.compile(u'[\\W]', re.UNICODE)

# Any single non-word character.
delete_non_letters = re.compile(u'[\\W]', re.UNICODE)

# "<word> all", e.g. the CQL fragment "title all".
delete_word_all = re.compile(u'\\w+\\sall', re.UNICODE)

# "<word> all and" / "<word> all or".
delete_word_all_and_or = re.compile(u'\\w+\\sall\\s(?:and|or)', re.UNICODE)

# A token: a run of digits/dots, a run of word characters, or a "$" amount.
text_tokens = re.compile(u'([\\d.]+\\b|\\w+|\\$[\\d.]+)', re.UNICODE)

# Trailing whitespace, leading whitespace, or newlines that follow a
# character other than whitespace, digits, word characters and . ; , ! ?
strip_remove_newlines = re.compile(
    u'(?:\\s+$|^\\s+|(?<=[^\\s\\d\\w.;,!?])\n+)', re.UNICODE)

# One or more whitespace characters.
reduce_spaces = re.compile(u'\\s+', re.UNICODE)

# Plain CQL directives that cqlkeywords drops from its output.
cqlterms = ('title', 'subject', 'person', 'enter', 'creator', 'isbn')
|
30
|
|
31
|
def keywords(*args):

    """
    .. function:: keywords(text1, [text2,...]) -> text

    Extracts the keywords found in a single column (text1), or in several
    columns taken together, and returns them joined with single spaces.
    Tokens are whatever ``text_tokens`` matches; a token that is just ``.``
    is left out.

    Examples:

    >>> table1('''
    ... first(second) third+fourth
    ... πρωτο(δευτερο) τριτο+τέταρτο
    ... 'πέμπτο all' 'qwer.zxcv'
    ... ''')
    >>> sql("select keywords(a,b) from table1")
    keywords(a,b)
    ---------------------------------------------------
    first second third fourth
    πρωτο δευτερο τριτο τέταρτο
    πέμπτο all qwer zxcv
    """

    # The first argument is mandatory (IndexError when called without any).
    tokens = list(text_tokens.findall(args[0]))
    for column in args[1:]:
        tokens.extend(text_tokens.findall(column))

    return ' '.join([token for token in tokens if token != '.'])

keywords.registered = True
|
61
|
|
62
|
|
63
|
def cqlkeywords(*args):

    """
    .. function:: cqlkeywords(text1, [text2,...]) -> text

    Returns the lower-cased keywords of a single column (text1) or of
    several columns taken together.

    Unlike :func:`keywords`, it also removes CQL syntax such as
    "title all" or "author all" (optionally followed by "and"/"or") and the
    plain CQL directives listed in ``cqlterms`` ('creator', 'title', ...).

    Examples:

    >>> table1('''
    ... first(second) third+fourth
    ... πρωτο(δευτερο) τριτο_τέταρτο
    ... 'πέμπτο all' 'έκτο title all τεστ'
    ... 'title all and something' 'other'
    ... 'title and something' 'other'
    ... ''')
    >>> sql("select cqlkeywords(a,b) from table1")
    cqlkeywords(a,b)
    ---------------------------------------------------
    first second third fourth
    πρωτο δευτερο τριτο_τέταρτο
    έκτο τεστ
    something other
    and something other
    """

    kept = []
    for column in args:
        text = delete_non_letters.sub(' ', column.lower())
        # "<word> all and|or" has to go before the shorter "<word> all".
        text = delete_word_all_and_or.sub('', text)
        text = delete_word_all.sub('', text)
        text = reduce_spaces.sub(' ', text).strip()
        kept.extend(
            word for word in text.split(' ')
            if word and word not in cqlterms)

    return ' '.join(kept)

cqlkeywords.registered = True
|
111
|
|
112
|
|
113
|
def kwnum(*args):

    """
    .. function:: kwnum(text1, [text2,...]) -> int

    Counts the simple keywords of its arguments by splitting each one on
    single spaces and adding up the number of pieces. The input is expected
    to be words separated by spaces, as returned by cqlkeywords or keywords;
    an empty string therefore counts as one piece.

    Examples:

    >>> table1('''
    ... 'word1 word2 word3'
    ... 'word1 word2'
    ... 'word'
    ... ''')
    >>> sql("select kwnum(a) from table1")
    kwnum(a)
    --------
    3
    2
    1
    """

    return sum(len(text.split(' ')) for text in args)

kwnum.registered = True
|
144
|
|
145
|
def uniqueterms(*args):
    """
    .. function:: uniqueterms(text1, [text2,...]) -> text

    Returns the distinct space-separated terms of its arguments, in order
    of first appearance, joined with single spaces. Empty terms (produced
    by consecutive spaces) are ignored.

    Examples:

    >>> table1('''
    ... 'word1 word2 word2'
    ... 'word1 word2 word1'
    ... 'word'
    ... ''')
    >>> sql("select uniqueterms(a) from table1")
    uniqueterms(a)
    --------------
    word1 word2
    word1 word2
    word
    """

    seen = set()
    ordered = []
    for text in args:
        for term in text.split(' '):
            if term == '' or term in seen:
                continue
            seen.add(term)
            ordered.append(term)

    return ' '.join(ordered)

uniqueterms.registered = True
|
177
|
|
178
|
|
179
|
# "<field> all" for the CQL field names reported by cqlfields; the field
# name is the only captured group.
match_field_all = re.compile(
    r'(title|isbn|issn|subject|creator|language|type)\sall', re.UNICODE)


def cqlfields(*args):

    """
    .. function:: cqlfields(text1, [text2,...]) -> text

    Returns the CQL field names (title, isbn, issn, subject, creator,
    language, type) that are followed by "all" in a single column or in
    several columns taken together. The names are returned in order of
    appearance, separated by spaces. Input is lower-cased and its non-word
    characters are replaced by spaces before matching.

    >>> table1('''
    ... '(title all "scrieri") and (creator all "arghezi") and (title all "other")'
    ... '("maschinenschreiben") and (language all "ger")'
    ... '("sauer") and ("übungsbuch")'
    ... ''')
    >>> sql("select cqlfields(a) from table1")
    cqlfields(a)
    -------------------
    title creator title
    language
    <BLANKLINE>
    """

    found = []
    for column in args:
        cleaned = delete_numbers_and_non_letters.sub(' ', column.lower())
        found.extend(match_field_all.findall(cleaned))

    return ' '.join(found)


cqlfields.registered = True
|
215
|
|
216
|
def comprspaces(*args):
    """
    .. function:: comprspaces(text1, [text2,...]) -> text

    Strips the spaces at the beginning and the end of each argument,
    compresses every remaining run of whitespace into one space and joins
    the results with single spaces.

    Examples:

    >>> table1('''
    ... ' an example with spaces ' 'another example with spaces '
    ... ''')
    >>> sql("select comprspaces(a,b) from table1")
    comprspaces(a,b)
    --------------------------------------------------
    an example with spaces another example with spaces
    """

    # Joining a one-element list returns that element, so the single
    # argument case needs no special handling.
    squeezed = [
        reduce_spaces.sub(' ', strip_remove_newlines.sub('', text))
        for text in args]

    return ' '.join(squeezed)

comprspaces.registered = True
|
245
|
|
246
|
# A run of whitespace , . ; characters, or a run of characters that are
# neither word characters, commas, periods nor whitespace.
reduce_special_characters = re.compile(
    u'(?:[\\s\\n,.;]+|[^\\w,.\\s]+)', re.UNICODE)

# A run of underscores delimited by word boundaries on both sides.
reduce_underscore = re.compile(u'(\\b_+\\b)', re.UNICODE)


def normreplace(a):
    """
    Replacement callback for ``reduce_special_characters``.

    *a* is a match object. Returns a space when the matched run starts with
    a space, tab, newline, period, comma or semicolon, and an underscore
    for any other run.
    """
    first = a.group()[0]
    return ' ' if first in ' \t\n.,;' else '_'
|
254
|
|
255
|
def normalizetext(*args):
    """
    .. function:: normalizetext(text1, [text2,...]) -> text

    Normalizes a text: runs of whitespace, commas, periods and semicolons
    become a single space, every other run of non-word characters becomes
    '_'. Stand-alone underscores are then turned into spaces, spaces are
    compressed and each argument is stripped before the results are joined
    with single spaces.

    Examples:

    >>> table1('''
    ... first(second) third+fourth
    ... πρωτο(δευτερο) τριτο+τέταρτο
    ... 'πέμπτο all' 'έκτο title all τεστ'
    ... ''')
    >>> sql("select normalizetext(a,b) from table1")
    normalizetext(a,b)
    ----------------------------------------------------
    first_second_ third_fourth
    πρωτο_δευτερο_ τριτο_τέταρτο
    πέμπτο all έκτο title all τεστ
    """

    def _normalize(text):
        text = reduce_special_characters.sub(normreplace, text)
        text = reduce_underscore.sub(' ', text)
        return reduce_spaces.sub(' ', text).strip()

    return ' '.join([_normalize(text) for text in args])

normalizetext.registered = True
|
284
|
|
285
|
|
286
|
# A whole string made only of "regular" characters: word characters,
# whitespace, printable ASCII (! to ~) and the punctuation listed explicitly.
query_regular_characters = re.compile(
    u"""^[·∆©(́−·¨¬…‐"•΄€„”“‘’´«»’ʹ–\\w\\s\\[!-~\\]]*$""", re.UNICODE)


def isvalidutf8(*args):

    """
    .. function:: isvalidutf8(text) -> 1/0

    Returns 1 if the input text looks like valid UTF-8, or 0 if not.
    This is a heuristic used to find corrupted UTF-8 strings: a value is
    rejected when it is null or contains a character outside the set of
    common characters accepted by ``query_regular_characters``.

    Examples:

    >>> table1('''
    ... test
    ... δοκιμή!
    ... sévignÃ
    ... évezred
    ... ''')
    >>> sql("select isvalidutf8(a) from table1")
    isvalidutf8(a)
    --------------
    1
    1
    1
    1
    """

    for text in args:
        if text is None or not query_regular_characters.match(text):
            return 0

    return 1

isvalidutf8.registered = True
|
323
|
|
324
|
|
325
|
# Candidates for cleaning: anything that is neither a word character nor
# printable ASCII (! to ~).
characters_to_clean = re.compile(u"""[^\\w!-~]""", re.UNICODE)


def utf8clean(*args):

    """
    .. function:: utf8clean(text) -> text

    Removes control characters from input utf-8 text. Newlines are kept.
    The cleaned arguments are concatenated. Arguments that are neither
    ``str`` nor ``unicode`` are converted with ``unicode(value,
    errors='replace')`` instead of being cleaned.

    Examples:

    >>> table1('''
    ... test
    ... δοκιμή!
    ... sévignÃ
    ... évezred
    ... ''')
    >>> sql("select utf8clean(a) from table1")
    utf8clean(a)
    -------------
    test
    δοκιμή!
    sévignÃ
    évezred
    """

    def _drop_control(match):
        # Unicode general categories starting with 'C' are the control /
        # format / unassigned families; '\n' is the one exception kept.
        char = match.group()[0]
        if char == '\n' or unicodedata.category(char)[0] != 'C':
            return char
        return u''

    parts = []
    for value in args:
        if type(value) in (str, unicode):
            parts.append(characters_to_clean.sub(_drop_control, value))
        else:
            parts.append(unicode(value, errors='replace'))

    return ''.join(parts)

utf8clean.registered = True
|
368
|
|
369
|
def regexpr(*args):

    """
    .. function:: regexpr(pattern,expression[,replacestr])

    With two arguments, searches *expression* for *pattern*: returns the
    captured groups (packed by ``jopts.toj``) when the pattern has groups,
    true when it matches without groups, and null when it does not match.
    With three arguments, replaces the matches of *pattern* in *expression*
    with *replacestr*. Any other number of arguments yields null.
    `Pattern Syntax <http://docs.python.org/library/re.html#re-syntax>`_ is
    according to python's re module.

    Examples use `inversion`.

    Examples:

    >>> table1('''
    ... 25
    ... ''')

    >>> sql("regexpr 'start\\s(\\w+)\\send' 'start otherword end' ")
    regexpr('start\\s(\\w+)\\send','start otherword end')
    --------------------------------------------------
    otherword

    >>> sql("regexpr '\\W+' '@#$%@$#% tobereplaced @#$%@#$%' 'nonword' ")
    regexpr('\\W+','@#$%@$#% tobereplaced @#$%@#$%','nonword')
    ---------------------------------------------------------
    nonwordtobereplacednonword

    >>> sql("select regexpr('(\\w+).*?(\\w+)', 'one two three')")
    regexpr('(\\w+).*?(\\w+)', 'one two three')
    -----------------------------------------
    ["one","two"]
    """
    argc = len(args)
    if argc < 2:
        return

    if argc == 2:
        found = re.search(args[0], unicode(args[1]), re.UNICODE)
        if found is None:
            return None
        captured = found.groups()
        if captured:
            return jopts.toj(captured)
        return True

    if argc == 3:
        try:
            return re.sub(args[0], args[2], args[1], flags=re.UNICODE)
        except TypeError:
            # Retry without the ``flags`` keyword -- presumably for re.sub()
            # versions that do not accept it.
            return re.sub(args[0], args[2], args[1])

regexpr.registered = True
|
422
|
|
423
|
def regexprfindall(*args):
    """
    .. function:: regexprfindall(pattern,text)

    Returns *all* matches of *pattern* in *text*, packed by
    ``jopts.tojstrict``. Raises an OperatorError unless exactly two
    arguments are given.

    Examples:

    >>> sql("select regexprfindall('\\w+', 'one')")
    regexprfindall('\\w+', 'one')
    ----------------------------
    ["one"]

    >>> sql("select regexprfindall('\\w+', 'one two three')")
    regexprfindall('\\w+', 'one two three')
    --------------------------------------
    ["one","two","three"]
    """

    if len(args) != 2:
        raise functions.OperatorError('regexprfindall', 'Two parameters should be provided')

    pattern, text = args
    matches = re.findall(pattern, unicode(text), re.UNICODE)
    return jopts.tojstrict(matches)

regexprfindall.registered = True
|
448
|
|
449
|
def regexprmatches(*args):

    """
    .. function:: regexprmatches(pattern, arg)

    Returns true if *pattern* is found anywhere in *arg* (converted to
    text), false otherwise. Raises an OperatorError unless exactly two
    arguments are given.

    Examples use `inversion`.

    Examples:

    >>> sql("regexprmatches '(a)' 'qwer a qwer' ")
    regexprmatches('(a)','qwer a qwer')
    -----------------------------------
    1

    """
    if len(args) != 2:
        raise functions.OperatorError('regexprmatches', 'Two parameters should be provided')

    pattern, text = args
    return re.search(pattern, unicode(text), re.UNICODE) is not None

regexprmatches.registered = True
|
476
|
|
477
|
|
478
|
def regexpcountwithpositions(pattern,expression,start = 0,min = 0.5,multiply = 1):
    """
    .. function:: regexpcountwithpositions(pattern, expression, start = 0, min = 0.5, multiply = 1)

    Returns a score of positioned matches of pattern in expression.

    The expression is scanned for *pattern* or a single whitespace
    character; every hit gets the next position number (1, 2, ...). Hits
    other than a single space are treated as matches of *pattern*.

    - When no match contributes to the weighted total the score is 0.0.
    - With ``start == 0`` the score is
      ``min + sum(position * multiply) / number_of_hits``, so later matches
      weigh more.
    - With any other *start* the score is
      ``min + sum(number_of_hits - position) / number_of_hits``, so earlier
      matches weigh more.

    Examples:

    >>> sql("regexpcountwithpositions 'start' 'start end start' ")
    regexpcountwithpositions('start','start end start')
    ---------------------------------------------------
    1.75

    >>> sql("regexpcountwithpositions 'first' 'first second third fourth'")
    regexpcountwithpositions('first','first second third fourth')
    -------------------------------------------------------------
    0.75

    >>> sql("regexpcountwithpositions 'fourth' 'first second third fourth'")
    regexpcountwithpositions('fourth','first second third fourth')
    --------------------------------------------------------------
    1.5

    >>> sql("regexpcountwithpositions 'fourth' 'first second third fourth' 1")
    regexpcountwithpositions('fourth','first second third fourth','1')
    ------------------------------------------------------------------
    0.5
    """

    hits = 0          # number of pattern-or-whitespace hits seen so far
    weighted = 0      # sum of position * multiply over the pattern matches
    positions = []    # positions of the pattern matches
    for found in re.finditer(pattern + r'|(\s)', expression, re.UNICODE):
        hits += 1
        if found.group() != ' ':
            positions.append(hits)
            weighted += hits * multiply

    if weighted == 0:
        return 0.0

    # A non-zero total implies at least one hit, so the divisions are safe.
    if start == 0:
        return min + weighted / float(hits)
    return min + sum(hits - position for position in positions) / float(hits)

regexpcountwithpositions.registered = True
|
541
|
|
542
|
|
543
|
def regexpcountuniquematches(*args):
    """
    .. function:: regexpcountuniquematches(pattern, expression)

    Returns the number of distinct matches of pattern in expression
    (repeated occurrences of the same match are counted once).

    Examples:

    >>> sql("regexpcountuniquematches 'start' 'start end start' ")
    regexpcountuniquematches('start','start end start')
    ---------------------------------------------------
    1

    >>> sql("regexpcountuniquematches 'start end' 'start end start' ")
    regexpcountuniquematches('start end','start end start')
    -------------------------------------------------------
    1

    """

    pattern = args[0]
    text = unicode(args[1])
    distinct = set(re.findall(pattern, text, re.UNICODE))
    return len(distinct)

regexpcountuniquematches.registered = True
|
568
|
|
569
|
|
570
|
def regexpcountwords(*args):
    """
    .. function:: regexpcountwords(pattern, expression)

    Returns the number of matches of pattern in expression. A match that
    spans several words contributes the number of its words (the spaces in
    the stripped match, plus one).

    Examples:

    >>> sql("regexpcountwords 'start' 'start end start' ")
    regexpcountwords('start','start end start')
    -------------------------------------------
    2

    >>> sql("regexpcountwords 'start end' 'start end start' ")
    regexpcountwords('start end','start end start')
    -----------------------------------------------
    2
    """

    words = 0
    for found in re.finditer(args[0], unicode(args[1]), re.UNICODE):
        words += found.group().strip().count(' ') + 1
    return words

regexpcountwords.registered = True
|
592
|
|
593
|
|
594
|
def contains(*args):
    """
    .. function:: contains(str1,str2) -> bool

    Returns true if string *str1* contains *str2*. Raises an OperatorError
    unless exactly two arguments are given.

    Examples:

    >>> sql("select contains('test string', 'str') as test ")
    test
    ----
    1
    >>> sql("select contains('test string', 'nostr') as test ")
    test
    ----
    0
    """
    if len(args) != 2:
        # Report the error under this operator's own name (it used to be
        # reported as "included").
        raise functions.OperatorError("contains", "operator takes exactly two arguments")
    return args[1] in args[0]

contains.registered = True
|
618
|
|
619
|
|
620
|
def unitosuni(*args):
    """
    .. function:: unitosuni(str)

    Returns *str* replacing non-ascii characters with their equivalent
    unicode code point literal at the \\u00 format. Null stays null; if the
    conversion fails the argument is returned unchanged.

    Examples:

    >>> sql("select unitosuni('brûlé') as test ")
    test
    ---------------
    br\\u00fbl\\u00e9
    >>> sql("select sunitouni(null)")
    sunitouni(null)
    ---------------
    None
    >>> sql("select unitosuni(9)")
    unitosuni(9)
    ------------
    9
    """
    if len(args) != 1:
        raise functions.OperatorError("unitosuni", "operator takes only one arguments")

    value = args[0]
    if value is None:
        return None

    try:
        # repr() of a unicode object looks like u'...' with non-ascii
        # characters escaped; \xNN escapes are rewritten as \u00NN and the
        # leading u' and the trailing quote are cut off.
        literal = repr(unicode(value))
        return literal.replace('\\x', '\\u00')[2:-1]
    except Exception:
        # KeyboardInterrupt is not an Exception subclass, so it still
        # propagates to the caller.
        return value

unitosuni.registered = True
|
654
|
|
655
|
|
656
|
def sunitouni(*args):
    """
    .. function:: sunitouni(str)

    Returns *str* replacing literal unicode code points to their string representation.
    Null stays null; when the text cannot be evaluated as a unicode literal
    the argument is returned unchanged.

    Examples:

    >>> sql("select sunitouni('br\\u00fbl\\u00e9') as test ")
    test
    -------
    brûlé
    >>> sql("select sunitouni('\\u that is not a unicode code point') as test ")
    test
    -----------------------------------
    \\u that is not a unicode code point
    >>> sql("select sunitouni(null)")
    sunitouni(null)
    ---------------
    None
    >>> sql("select sunitouni(9)")
    sunitouni(9)
    ------------
    9
    """
    if len(args)!=1:
        raise functions.OperatorError("sunitouni","operator takes only one arguments")
    if args[0]==None:
        return None
    # Build the source text of a u'...' literal around the argument; only
    # single quotes are escaped.
    kk="u'%s'" %(unicode(args[0]).replace("'","\\'"))
    try:
        # NOTE(review): SECURITY -- eval() runs on text derived from the
        # argument. Because backslashes are not escaped, an input containing
        # a backslash right before a quote ends the literal early and the
        # rest is evaluated as Python code. Do not feed untrusted data to
        # this function until eval() is replaced by a codec-based decode.
        return eval(kk)
    except KeyboardInterrupt:
        raise
    except Exception:
        # Not a valid literal (e.g. a truncated \u escape): give the input back.
        return args[0]

sunitouni.registered=True
|
694
|
|
695
|
|
696
|
def stripchars(*args):
    """
    .. function:: stripchars(str[,stripchars])

    Returns *str* (converted to text) without its leading and trailing
    whitespace characters, or without the leading and trailing *stripchars*
    characters if given. Null stays null. Works like python's
    `strip function <http://docs.python.org/library/stdtypes.html#str.strip>`_.

    Examples:

    >>> sql("select stripchars(' initial and final spaces ') as test ")
    test
    ------------------------
    initial and final spaces
    >>> sql("select stripchars(' <initial and final spaces> ',' <>') as test ")
    test
    ------------------------
    initial and final spaces
    >>> sql("select stripchars(null)")
    stripchars(null)
    ----------------
    None
    """
    if len(args) < 1:
        raise functions.OperatorError("stripchars", "operator takes at least one arguments")

    value = args[0]
    if value is None:
        return None

    text = unicode(value)
    if len(args) < 2:
        return text.strip()
    return text.strip(args[1])

stripchars.registered = True
|
728
|
|
729
|
|
730
|
def reencode(*args):
    """
    .. function:: reencode(str)

    Tries to repair text whose UTF-8 bytes were decoded with a single-byte
    codec: the text is encoded as iso-8859-1 and the bytes are decoded as
    UTF-8; if that fails the same is tried with windows-1252. When neither
    round trip succeeds the text is returned as it was. Null stays null.
    Raises an OperatorError unless exactly one argument is given.
    """
    if len(args) != 1:
        raise functions.OperatorError("reencode", "operator takes only one arguments")

    value = args[0]
    if value is None:
        return None

    text = unicode(value)
    for codec in ('iso-8859-1', 'windows-1252'):
        try:
            return unicode(text.encode(codec), 'utf-8')
        except Exception:
            # KeyboardInterrupt is not an Exception subclass and still
            # propagates; any other failure moves on to the next codec.
            continue
    return text

reencode.registered = False
|
751
|
|
752
|
|
753
|
def normuni(*args):
    """
    .. function:: normuni(str)

    Returns *str* normalised in the composed unicode normal form (NFC)
    without replacing same look characters. For example this 'À' character
    can be encoded with one or two different characters, :func:`normuni`
    returns an one-character encoded version. This function is important to
    check true strings equality. Null stays null.

    Functions :func:`sunitouni` and :func:`unitosuni` are used in the examples to make it more comprehensive.

    Examples:

    .. note::
        Returned results in the next two examples should look the same,
        if not that is a bug at the combined characters rendering of the shell
        that the documentation was created.

    >>> sql("select sunitouni('C\\u0327') as test ")
    test
    ----
    Ç
    >>> sql("select normuni(sunitouni('C\\u0327')) as test ")
    test
    ----
    Ç
    >>> sql("select unitosuni(normuni(sunitouni('C\\u0327'))) as test ")
    test
    ------
    \\u00c7
    """
    if len(args) != 1:
        raise functions.OperatorError("normuni", "operator takes only one arguments")

    text = args[0]
    return None if text is None else unicodedata.normalize('NFC', text)

normuni.registered = True
|
791
|
|
792
|
|
793
|
def hashmd5(*args):
    """
    .. function:: hashmd5(args)

    Returns an MD5 hash of args. Every argument is converted to text with
    ``repr`` before hashing; several arguments are joined with the ASCII
    record separator (character 30).

    Examples:

    >>> sql("select hashmd5(65)")
    hashmd5(65)
    --------------------------------
    fc490ca45c00b1249bbe3554a4fdf6fb
    >>> sql("select hashmd5(6,5)")
    hashmd5(6,5)
    --------------------------------
    f0d95c20cde50e3ca03cab53f986b6c3
    >>> sql("select hashmd5(5)")
    hashmd5(5)
    --------------------------------
    e4da3b7fbbce2345d7772b0674a318d5
    >>> sql("select hashmd5('5')")
    hashmd5('5')
    --------------------------------
    7000aaf68ca7a93da0af3d03850571c2
    """

    # Joining a single repr yields that repr, so one code path covers both
    # the one-argument and the many-argument case.
    payload = chr(30).join([repr(x) for x in args])
    # hashlib only accepts bytes: encode when repr() produced text. Where
    # repr() already returns a byte string the payload is left untouched.
    if not isinstance(payload, bytes):
        payload = payload.encode('utf-8')
    return hashlib.md5(payload).hexdigest()

hashmd5.registered = True
|
826
|
|
827
|
|
828
|
def hashmd5mod(*args):
    """
    .. function:: hashmd5mod(args, divisor) -> int

    Returns the *modulo* with divisor number of the MD5 hash of args.
    Arguments are converted to text with ``repr`` before hashing is
    performed. The last argument is the divisor.

    .. note::
        With exactly two arguments only the first one is hashed. With any
        other number of arguments the divisor is part of the hashed text as
        well (all arguments are joined with character 30); the examples
        below reflect this behaviour.

    Examples:

    >>> sql("select hashmd5mod(65, 3)")
    hashmd5mod(65, 3)
    -----------------
    0

    >>> sql("select hashmd5mod(6,5, 4)")
    hashmd5mod(6,5, 4)
    ------------------
    2

    >>> sql("select hashmd5mod(5, 5)")
    hashmd5mod(5, 5)
    ----------------
    3

    >>> sql("select hashmd5mod('5', 5)")
    hashmd5mod('5', 5)
    ------------------
    4
    """

    if len(args) == 2:
        payload = repr(args[0])
    else:
        payload = chr(30).join([repr(x) for x in args])
    # hashlib only accepts bytes: encode when repr() produced text. Where
    # repr() already returns a byte string the payload is left untouched.
    if not isinstance(payload, bytes):
        payload = payload.encode('utf-8')

    return int(hashlib.md5(payload).hexdigest(), 16) % args[-1]

hashmd5mod.registered = True
|
864
|
|
865
|
|
866
|
def crc32(*args):
    """
    .. function:: crc32(args) -> int

    Returns the CRC32 of args as an unsigned 32 bit number. Every argument
    is converted to text with ``repr`` before hashing; several arguments
    are joined with the ASCII record separator (character 30).

    Examples:

    >>> sql("select crc32(65)")
    crc32(65)
    ----------
    2658551721

    >>> sql("select crc32(6,5)")
    crc32(6,5)
    ----------
    1565899724

    >>> sql("select crc32(5)")
    crc32(5)
    ----------
    2226203566

    >>> sql("select crc32('5')")
    crc32('5')
    ----------
    1201448970
    """

    # Joining a single repr yields that repr, so one code path covers both
    # the one-argument and the many-argument case.
    payload = chr(30).join([repr(x) for x in args])
    # zlib.crc32 only accepts bytes: encode when repr() produced text.
    # Where repr() already returns a byte string nothing changes.
    if not isinstance(payload, bytes):
        payload = payload.encode('utf-8')
    # The mask turns a possibly negative result into an unsigned value.
    return zlib.crc32(payload) & 0xffffffff

crc32.registered = True
|
902
|
|
903
|
def hashmodarchdep(*args):
    """
    .. function:: hashmodarchdep(args, divisor) -> int

    Returns the builtin ``hash`` of the tuple of all arguments but the
    last, modulo the last argument (the divisor).

    .. note::

        This hash function is architecture dependent (32bit vs 64bit).

    Examples:

    >>> sql("select hashmodarchdep(65,5)") #doctest:+ELLIPSIS
    hashmodarchdep(65,5)
    --------------------
    ...

    >>> sql("select hashmodarchdep(6,5)") #doctest:+ELLIPSIS
    hashmodarchdep(6,5)
    -------------------
    ...

    >>> sql("select hashmodarchdep(5,5)") #doctest:+ELLIPSIS
    hashmodarchdep(5,5)
    -------------------
    ...

    >>> sql("select hashmodarchdep('5',5)") #doctest:+ELLIPSIS
    hashmodarchdep('5',5)
    ---------------------
    ...
    """

    divisor = args[-1]
    key = tuple(args[:-1])
    return hash(key) % divisor

hashmodarchdep.registered = True
|
939
|
|
940
|
|
941
|
def textreferences(txt,maxlen = 5,pattern = r'(\b|_)((1[5-9]\d{2,2})|(20\d{2,2}))(\b|_)' ):
    """
    .. function:: textreferences(text, maxlen = 5, pattern = (\\b|_)((1[5-9]\\d{2,2})|(20\\d{2,2}))(\\b|_))

    Returns the "Reference" section of documents. To find it, it searches for parts of the document that
    have a high density of pattern matches.

    .. parameters:: txt,maxlen,pattern
       txt: input text.
       maxlen: the size of the scrolling window over the text in which the density is calculated.
       pattern: regular expression that is matched against the lines of the text. By default the pattern matches
                year occurences so as to extract sections that look like references.

    Windows line endings are converted to newlines first. Only lines longer
    than 10 characters are considered. Walking from the last line to the
    first, each such line gets a density: the number of matching lines
    among the last *maxlen* considered lines, divided by *maxlen*. Lines
    whose density is at least the mean density are returned, in their
    original order, separated by newlines.

    Examples:

    >>> table1('''
    ... eeeeeeeeeeeeee
    ... gggggggggggggg
    ... aaaaaaaaaaaaaa
    ... bbbbbbbbbbbbbb
    ... aaa_1914_ccccc
    ... bbb_2014_bbbbb
    ... dddd_2008_ddddddd
    ... cccc_2005_ccccc
    ... ccccc_2014_ccccc
    ... dddddd_2009_ddddd
    ... gggggggggggggg
    ... ''')

    >>> sql("select textreferences(group_concat(a,'\\n'),1,'(\b|_)(1|2)\\d{3,3}(\b|_)') as a from table1")
    a
    --------------------------------------------------------------------------------------------------
    aaa_1914_ccccc
    bbb_2014_bbbbb
    dddd_2008_ddddddd
    cccc_2005_ccccc
    ccccc_2014_ccccc
    dddddd_2009_ddddd


    If an inadequate amount of newlines is found, it returns the text as is.

    >>> sql("select textreferences(group_concat(a,'.')) from table1")
    textreferences(group_concat(a,'.'))
    -----------------------------------------------------------------------------------------------------------------------------------------------------------------------------
    eeeeeeeeeeeeee.gggggggggggggg.aaaaaaaaaaaaaa.bbbbbbbbbbbbbb.aaa_1914_ccccc.bbb_2014_bbbbb.dddd_2008_ddddddd.cccc_2005_ccccc.ccccc_2014_ccccc.dddddd_2009_ddddd.gggggggggggggg


    >>> sql("select textreferences('')")
    textreferences('')
    ------------------
    <BLANKLINE>
    """

    exp = re.sub('\r\n','\n',txt)

    # Fewer than 10 newlines: too little text to look for a dense section.
    if exp.count('\n') < 10:
        return exp

    # Candidate lines, last line first; short lines never take part.
    candidates = [line for line in reversed(exp.split('\n')) if len(line) > 10]
    hits = [1 if re.search(pattern, line) else 0 for line in candidates]

    # Sliding window over the hits. The window starts out filled with
    # placeholders that are pushed out while it warms up, so popleft() only
    # ever returns real hit values.
    window = deque([''] * maxlen, maxlen)
    filled = 0
    running = 0
    densities = []
    for hit in hits:
        if filled < maxlen:
            filled += 1
        else:
            running -= window.popleft()
        window.append(hit)
        running += hit
        densities.append(float(running) / maxlen)

    # Mean density; 0 when no line was long enough to be considered.
    threshold = sum(densities) / len(densities) if densities else 0

    references = [
        line for line, density in zip(candidates, densities)
        if density >= threshold]
    return '\n'.join(reversed(references))

textreferences.registered = True
|
1045
|
|
1046
|
|
1047
|
def textwindow(*args):
    r"""
    .. function:: textwindow(text, previous_word_count = 0, next_word_count = 0, middle_word_count = 1, pattern = None)

    Returns a rolling window over the words of *text* (words are separated by
    single spaces). Every window consists of *previous_word_count* words before
    the middle, *middle_word_count* words joined by a space as the middle, and
    *next_word_count* words after the middle. Positions that fall outside the
    text are returned as empty strings.

    The fourth and fifth arguments may be given in either order: an integer is
    used as *middle_word_count*, anything else as a regular expression
    *pattern*. When a pattern is given only the windows whose middle matches it
    are returned.

    If *previous_word_count* is negative and a pattern is given, the pattern is
    tested against the first word of the middle, and the prev columns hold the
    most recent words that did not match the pattern (matching words are
    filtered out of the prev output). Without a pattern a negative
    *previous_word_count* is treated as its absolute value.

    Examples:

    >>> sql("select textwindow('This is a test phrase') ")
    middle
    ------
    This
    is
    a
    test
    phrase

    >>> sql("select textwindow('This is a test phrase',1,1) ")
    prev1 | middle | next1
    -----------------------
          | This   | is
    This  | is     | a
    is    | a      | test
    a     | test   | phrase
    test  | phrase |

    >>> sql("select textwindow('This is a test phrase',1,1,2) ")
    prev1 | middle      | next1
    ----------------------------
          | This is     | a
    This  | is a        | test
    is    | a test      | phrase
    a     | test phrase |
    test  | phrase      |

    >>> sql("select textwindow('This is a test phrase with pdb codes: 1abc 2bcd 3cde 4bde ',-2,1,2,'\d\w{3}' ) ")
    prev1 | prev2  | middle    | next1
    ----------------------------------
    pdb   | codes: | 1abc 2bcd | 3cde
    pdb   | codes: | 2bcd 3cde | 4bde
    pdb   | codes: | 3cde 4bde |
    pdb   | codes: | 4bde      |

    >>> sql("select textwindow('This is a test phrase (123) for filtering middle with a number',1,1,'\d+') ")
    prev1  | middle | next1
    -----------------------
    phrase | (123)  | for

    """
    text = args[0]
    prev = args[1] if len(args) > 1 else 0
    nextlen = args[2] if len(args) > 2 else 0

    middle = 1
    pattern = None
    # Arguments 4 and 5 are interchangeable: an integer sets the middle word
    # count, any other value is the filter pattern.
    for extra in args[3:5]:
        if isinstance(extra, int):
            middle = extra
        else:
            pattern = extra

    # The "filtered prev" mode needs a pattern; without one a negative count
    # simply means that many previous words.
    if pattern is None:
        prev = abs(prev)

    # First row is the schema: prev1..prevN, middle, next1..nextM.
    yield tuple(itertools.chain(
        ('prev' + str(x) for x in range(1, abs(prev) + 1)),
        ('middle',),
        ('next' + str(y) for y in range(1, nextlen + 1))))

    # Pad with empty strings so the first and last words get full windows.
    # A negative prev adds no left padding.
    words = ([''] * max(prev, 0) + text.split(' ')
             + [''] * ((middle - 1) + nextlen))

    # Compiled after the schema row so an invalid pattern fails at the same
    # point of the iteration as before.
    patt = None
    if pattern is not None:
        patt = re.compile(pattern, re.UNICODE)

    if prev >= 0:
        window = prev + nextlen + middle
        mid_end = prev + middle
        for i in range(len(words) - window + 1):
            mid = ' '.join(words[i + prev:i + mid_end])
            if patt is None or patt.search(mid):
                # Rows of this mode are lists (kept as in the original API).
                yield words[i:i + prev] + [mid] + words[i + mid_end:i + window]
    else:
        # prev < 0 implies a pattern was given (see the abs() above).
        count = -prev
        window = nextlen + middle
        # Bounded history of the latest non-matching words.
        history = deque([''] * count, count)
        for i in range(len(words) - window + 1):
            # Only the first middle word is tested, so every matching word
            # starts exactly one row.
            if patt.search(words[i]):
                mid = ' '.join(words[i:i + middle])
                yield tuple(itertools.chain(
                    history, [mid], words[i + middle:i + window]))
            else:
                history.append(words[i])

textwindow.registered = True
|
1180
|
|
1181
|
|
1182
|
def textwindow2s(*args):
    r"""
    .. function:: textwindow2s(text, prev_word_count, middle_word_count, next_word_count, pattern)

    Returns a rolling window over the words of *text* (words are separated by
    single spaces) as three columns: up to *prev_word_count* words before the
    middle, *middle_word_count* words as the middle, and up to
    *next_word_count* words after the middle. The words of each column are
    joined by a space. Defaults are 0 previous words, 1 middle word and
    0 next words.

    You may filter the windows using a regular expression *pattern*; only
    windows whose middle matches it are returned.

    Examples:


    >>> sql("select textwindow2s('This is a test phrase',2,1,1) ")
    prev    | middle | next
    -------------------------
            | This   | is
    This    | is     | a
    This is | a      | test
    is a    | test   | phrase
    a test  | phrase |

    >>> sql("select textwindow2s('This is a test phrase',2,1,1, '\w{4}') ")
    prev   | middle | next
    ------------------------
           | This   | is
    is a   | test   | phrase
    a test | phrase |

    """
    g = args[0].split(' ')
    # First row is the schema; arguments are validated after it is emitted.
    yield ('prev', 'middle', 'next')

    prev = args[1] if len(args) > 1 else 0
    middle = args[2] if len(args) > 2 else 1

    nextlen = 0
    if len(args) > 3:
        try:
            nextlen = int(args[3])
        except Exception:
            raise functions.OperatorError('textwindow2s','Third argument should be an integer')

    patt = None
    if len(args) > 4:
        patt = args[4]
        # An already compiled pattern is used as is: re.compile() refuses
        # flags together with a compiled pattern.
        if not hasattr(patt, 'search'):
            try:
                # re.UNICODE so that \w, \d, \s, \b also cover non-ASCII text,
                # as required for every pattern in this module.
                patt = re.compile(patt, re.UNICODE)
            except Exception:
                raise functions.OperatorError('textwindow2s','Fourth argument must be string or compiled pattern')

    for i in range(len(g) - middle + 1):
        im = i + middle
        mid = ' '.join(g[i:im])
        if patt is None or patt.search(mid):
            # max(..., 0) keeps the prev slice from wrapping around to the
            # end of the list near the start of the text.
            yield (' '.join(g[max(i - prev, 0):i]),
                   mid,
                   ' '.join(g[im:im + nextlen]))

textwindow2s.registered = True
|
1247
|
|
1248
|
|
1249
|
if '.' not in __name__:
    # This is needed to be able to test the functions of this module; put it
    # at the end of every new function module you create.
    import sys
    import setpath
    from functions import *
    testfunction()
    if __name__ == "__main__":
        # reload() re-exposes sys.setdefaultencoding (Python 2 idiom) so the
        # doctests below run with utf-8 as the default string encoding.
        reload(sys)
        sys.setdefaultencoding('utf-8')
        import doctest
        doctest.testmod()
|