Project

General

Profile

1
# coding: utf-8
2
import setpath
3
import re
4
import functions
5
import unicodedata
6
import hashlib
7
import zlib
8
import itertools
9
from collections import deque
10
from lib import jopts
11

    
12
# Increase regular expression cache
try:
    # _MAXCACHE is an underscore-prefixed (non-public) attribute of the re
    # module; bumping it is a best-effort tweak, so failure is ignored.
    re._MAXCACHE = 1000
except Exception:
    pass

# Every regular expression containing \W \w \D \d \b \S \s needs to be compiled
# like below (with the re.UNICODE flag). If you want to embed the UNICODE
# directive inside the regular expression use:
# (?u) like re.sub(u'(?u)[\\W\\d]', ' ', o)
#
# The patterns are written as u'' literals with doubled backslashes instead of
# ur'' literals: the resulting pattern text is identical, but ur'' is a syntax
# error on Python 3.
#
# NOTE(review): delete_numbers_and_non_letters has exactly the same pattern as
# delete_non_letters, so digits are NOT removed despite the name -- confirm
# whether [\W\d] was intended before changing it.
delete_numbers_and_non_letters=re.compile(u'[\\W]',re.UNICODE)
delete_non_letters=re.compile(u'[\\W]',re.UNICODE)
# "<word> all" and "<word> all and|or" CQL fragments.
delete_word_all=re.compile(u'\\w+\\sall',re.UNICODE)
delete_word_all_and_or=re.compile(u'\\w+\\sall\\s(?:and|or)',re.UNICODE)
# Tokens: runs of digits/dots ending on a word boundary, words, or $amounts.
text_tokens = re.compile(u'([\\d.]+\\b|\\w+|\\$[\\d.]+)', re.UNICODE)
# Leading/trailing whitespace, plus newline runs that directly follow a
# character other than whitespace, a digit, a word character or one of .;,!?
strip_remove_newlines=re.compile(u'(?:\\s+$|^\\s+|(?<=[^\\s\\d\\w.;,!?])\n+)', re.UNICODE)
reduce_spaces=re.compile(u'\\s+', re.UNICODE)
# Plain CQL directives that cqlkeywords drops from its output.
cqlterms=('title', 'subject', 'person', 'enter', 'creator', 'isbn')
30

    
31
def keywords(*args):

    """
    .. function:: keywords(text1, [text2,...]) -> text

    Returns the keywords inside a single column (text1) or aggregated
    multiple columns.

    Every argument is tokenised with the module-level ``text_tokens``
    pattern; tokens that consist of a single '.' are dropped and the
    remaining tokens are joined with single spaces. Called without
    arguments it returns an empty string.

    Examples:

    >>> table1('''
    ... first(second)   third+fourth
    ... πρωτο(δευτερο)  τριτο+τέταρτο
    ... 'πέμπτο all'      'qwer.zxcv'
    ... ''')
    >>> sql("select keywords(a,b) from table1")
    keywords(a,b)
    ---------------------------------------------------
    first second third fourth
    πρωτο δευτερο τριτο τέταρτο
    πέμπτο all qwer zxcv
    """

    tokens = []
    for text in args:
        tokens.extend(text_tokens.findall(text))

    # text_tokens also matches a lone '.' sitting before a word; skip those.
    return ' '.join(tok for tok in tokens if tok != '.')

keywords.registered=True
61

    
62

    
63
def cqlkeywords(*args):

    """
    .. function:: cqlkeywords(text1, [text2,...]) -> text

    Returns the keywords inside a single column (text1) or aggregated
    from multiple columns.

    The difference of cqlkeywords to keywords is that cqlkeywords also
    strips cql syntax like "title all" or "author all" and plain cql directives
    like 'creator', 'title'...

    The input is lowercased before processing.

    Examples:

    >>> table1('''
    ... first(second)   third+fourth
    ... πρωτο(δευτερο)  τριτο_τέταρτο
    ... 'πέμπτο all'      'έκτο title all τεστ'
    ... 'title all and something' 'other'
    ... 'title and something' 'other'
    ... ''')
    >>> sql("select cqlkeywords(a,b) from table1")
    cqlkeywords(a,b)
    ---------------------------------------------------
    first second third fourth
    πρωτο δευτερο τριτο_τέταρτο
    έκτο τεστ
    something other
    and something other
    """

    out = []
    for text in args:
        cleaned = delete_non_letters.sub(' ', text.lower())
        # Remove "<word> all and|or" first so the shorter "<word> all"
        # pattern does not leave a dangling "and"/"or" behind.
        cleaned = delete_word_all_and_or.sub('', cleaned)
        cleaned = delete_word_all.sub('', cleaned)
        cleaned = reduce_spaces.sub(' ', cleaned).strip()

        out.extend(term for term in cleaned.split(' ')
                   if term and term not in cqlterms)

    return ' '.join(out)

cqlkeywords.registered=True
111

    
112

    
113
def kwnum(*args):

    """
    .. function:: kwnum(text1, [text2,...]) -> int

    Returns the number of simple keywords in a string.
    Its input should be words separated by spaces, as returned by
    cqlkeywords or keywords.

    Empty tokens (an empty input string, or the gaps produced by repeated
    spaces) are not counted.

    Examples:

    >>> table1('''
    ... 'word1 word2 word3'
    ... 'word1 word2'
    ... 'word'
    ... ''')
    >>> sql("select kwnum(a) from table1")
    kwnum(a)
    --------
    3
    2
    1
    """

    total = 0
    for text in args:
        # ''.split(' ') yields [''], so empty tokens must be filtered out or
        # an empty string would count as one keyword.
        total += sum(1 for term in text.split(' ') if term)

    return total

kwnum.registered=True
144

    
145
def uniqueterms(*args):
    """
    .. function:: uniqueterms(text1, [text2,...]) -> text

    Returns the unique terms of an input string.

    Terms are split on single spaces; the first occurrence of every term is
    kept, in order of appearance across all arguments, and empty terms are
    ignored.

    Examples:

    >>> table1('''
    ... 'word1 word2 word2'
    ... 'word1 word2 word1'
    ... 'word'
    ... ''')
    >>> sql("select uniqueterms(a) from table1")
    uniqueterms(a)
    --------------
    word1 word2
    word1 word2
    word
    """

    seen = set()
    ordered = []
    for text in args:
        for term in text.split(' '):
            if term == '' or term in seen:
                continue
            seen.add(term)
            ordered.append(term)

    return ' '.join(ordered)

uniqueterms.registered=True
177

    
178

    
179
# Matches a CQL field name followed by "all" and captures the field name.
# Raw string: '\s' in a non-raw literal is an invalid escape sequence.
match_field_all=re.compile(r'(title|isbn|issn|subject|creator|language|type)\sall',re.UNICODE)
180

    
181
def cqlfields(*args):

    """
    .. function:: cqlfields(text1, [text2,...]) -> text

    Returns the CQL field names used in a CQL query, i.e. every field name
    recognised by ``match_field_all`` that is followed by "all", in order of
    appearance and separated by spaces. The input is lowercased and its
    non-word characters are replaced by spaces before matching.

    Examples:

    >>> table1('''
    ... '(title all "scrieri") and (creator all "arghezi") and (title all "other")'
    ... '("maschinenschreiben") and (language all "ger")'
    ... '("sauer") and ("übungsbuch")'
    ... ''')
    >>> sql("select cqlfields(a) from table1")
    cqlfields(a)
    -------------------
    title creator title
    language
    <BLANKLINE>
    """

    found = []
    for text in args:
        cleaned = delete_numbers_and_non_letters.sub(' ', text.lower())
        found.extend(match_field_all.findall(cleaned))

    return ' '.join(found)


cqlfields.registered=True
215

    
216
def comprspaces(*args):
    """
    .. function:: comprspaces(text1, [text2,...]) -> text

    This function strips (from the beginning and the end) and compresses
    the spaces in its input.

    Each argument is cleaned with ``strip_remove_newlines`` and then every
    remaining whitespace run is replaced by a single space; the cleaned
    arguments are joined with single spaces.

    Examples:

    >>> table1('''
    ... '   an example    with spaces      '    'another    example with spaces         '
    ... ''')
    >>> sql("select comprspaces(a,b) from table1")
    comprspaces(a,b)
    --------------------------------------------------
    an example with spaces another example with spaces
    """

    # Joining a single cleaned argument yields that argument unchanged, so no
    # special case is needed for one-argument calls.
    return ' '.join(
        reduce_spaces.sub(' ', strip_remove_newlines.sub('', text))
        for text in args
    )

comprspaces.registered=True
245

    
246
# Either a run of whitespace / , . ; characters, or a run of other non-word
# characters. Written as u'' literals with doubled backslashes (same pattern
# text as the former ur'' literals, which are a syntax error on Python 3).
reduce_special_characters=re.compile(u'(?:[\\s\\n,.;]+|[^\\w,.\\s]+)',re.UNICODE)
# A run of underscores that stands on its own between word boundaries.
reduce_underscore = re.compile(u'(\\b_+\\b)',re.UNICODE)
248

    
249
def normreplace(a):
    """
    Replacement callback for ``reduce_special_characters``.

    *a* is a regular expression match object. Returns ' ' when the matched
    run starts with a whitespace character or one of ``. , ;`` and '_'
    otherwise.
    """
    first = a.group()[0]
    # Any whitespace counts (not only space, tab and newline): a run that
    # starts with e.g. '\r' used to be turned into '_' by mistake.
    if first.isspace() or first in '.,;':
        return ' '

    return '_'
254

    
255
def normalizetext(*args):
    """
    .. function:: normalizetext(text1, [text2,...]) -> text

    Normalizes a text by replacing all the non-words except \\s\\n,.; with '_'

    Runs of whitespace and of the characters , . ; become a single space,
    stand-alone underscore runs are replaced by a space, and each argument
    is stripped before the arguments are joined with single spaces.

    Examples:

    >>> table1('''
    ... first(second)   third+fourth
    ... πρωτο(δευτερο)  τριτο+τέταρτο
    ... 'πέμπτο all'      'έκτο title all τεστ'
    ... ''')
    >>> sql("select normalizetext(a,b) from table1")
    normalizetext(a,b)
    ----------------------------------------------------
    first_second_ third_fourth
    πρωτο_δευτερο_ τριτο_τέταρτο
    πέμπτο all έκτο title all τεστ
    """
    out = []
    for text in args:
        text = reduce_special_characters.sub(normreplace, text)
        text = reduce_underscore.sub(' ', text)
        out.append(reduce_spaces.sub(' ', text).strip())

    return ' '.join(out)

normalizetext.registered=True
284

    
285

    
286
# Whitelist of characters regarded as "regular": a fixed set of punctuation
# symbols, word characters, whitespace and the printable ASCII range !-~.
# (u'' literal with doubled backslashes: same pattern text as the former
# ur'' literal, which is a syntax error on Python 3.)
query_regular_characters=re.compile(u"""^[·∆©(́−·¨¬…‐"•΄€„”“‘’´«»’ʹ–\\w\\s\\[!-~\\]]*$""", re.UNICODE)

def isvalidutf8(*args):

    """
    .. function:: isvalidutf8(text) -> 1/0

    Returns 1 if the input text is in valid UTF-8 format, or 0 if not.
    This function is used to find corrupted UTF-8 strings with a heuristic
    based on non common characters.

    Every argument must consist solely of characters accepted by
    ``query_regular_characters``; a null argument yields 0.

    Examples:

    >>> table1('''
    ... test
    ... δοκιμή!
    ... sévignÃ
    ... évezred
    ... ''')
    >>> sql("select isvalidutf8(a) from table1")
    isvalidutf8(a)
    --------------
    1
    1
    1
    1
    """

    for text in args:
        if text is None or not query_regular_characters.match(text):
            return 0

    return 1

isvalidutf8.registered=True
323

    
324

    
325
# Any single character that is neither a word character nor printable ASCII.
# (u'' literal with doubled backslashes: same pattern text as the former
# ur'' literal, which is a syntax error on Python 3.)
characters_to_clean=re.compile(u"""[^\\w!-~]""", re.UNICODE)

def utf8clean(*args):

    """
    .. function:: utf8clean(text) -> text

    Removes control characters from input utf-8 text.

    Characters whose Unicode general category starts with 'C' are removed,
    except for the newline character, which is kept. The cleaned arguments
    are concatenated. Non-string arguments are converted to text: first by
    decoding with errors='replace', and, when the value cannot be decoded
    (e.g. a number), by plain text conversion.

    Examples:

    >>> table1('''
    ... test
    ... δοκιμή!
    ... sévignÃ
    ... évezred
    ... ''')
    >>> sql("select utf8clean(a) from table1")
    utf8clean(a)
    -------------
    test
    δοκιμή!
    sévignÃ
    évezred
    """

    # ``unicode`` on Python 2, ``str`` on Python 3.
    text_type = type(u'')

    def cleanchar(match):
        ch = match.group()[0]
        if ch != '\n' and unicodedata.category(ch)[0] == 'C':
            return u''
        return ch

    parts = []
    for value in args:
        if isinstance(value, (str, text_type)):
            parts.append(characters_to_clean.sub(cleanchar, value))
            continue
        try:
            parts.append(text_type(value, errors='replace'))
        except TypeError:
            # Decoding only works for byte buffers; numbers and other objects
            # used to crash here.
            parts.append(text_type(value))

    return ''.join(parts)

utf8clean.registered=True
368

    
369
def regexpr(*args):

    r"""
    .. function:: regexpr(pattern,expression[,replacestr])

    This function returns a match to the first parenthesis of *pattern*
    or replaces the matches of *pattern* in *expression* with *replacestr*.
    `Pattern Syntax <http://docs.python.org/library/re.html#re-syntax>`_ is
    according to python's re module.

    With two arguments: returns the captured groups (encoded by
    ``jopts.toj``) when *pattern* contains groups, true when it matches but
    has no groups, and null when it does not match. With fewer than two or
    more than three arguments it returns null.

    Examples use `inversion`.

    Examples:

    >>> table1('''
    ... 25
    ... ''')

    >>> sql("regexpr 'start\s(\w+)\send' 'start otherword end'  ")
    regexpr('start\s(\w+)\send','start otherword end')
    --------------------------------------------------
    otherword

    >>> sql("regexpr '\W+' '@#$%@$#% tobereplaced @#$%@#$%' 'nonword'  ")
    regexpr('\W+','@#$%@$#% tobereplaced @#$%@#$%','nonword')
    ---------------------------------------------------------
    nonwordtobereplacednonword

    >>> sql("select regexpr('(\w+).*?(\w+)', 'one two three')")
    regexpr('(\w+).*?(\w+)', 'one two three')
    -----------------------------------------
    ["one","two"]
    """
    if len(args) < 2:
        return None

    if len(args) == 2:
        # type(u'') is ``unicode`` on Python 2 and ``str`` on Python 3.
        found = re.search(args[0], type(u'')(args[1]), re.UNICODE)
        if found is None:
            return None
        if found.groups():
            return jopts.toj(found.groups())
        return True

    if len(args) == 3:
        # Compiling with the flag avoids the ``flags=`` keyword of re.sub,
        # which used to need a TypeError fallback on old interpreters.
        return re.compile(args[0], re.UNICODE).sub(args[2], args[1])

    return None

regexpr.registered = True
422

    
423
def regexprfindall(*args):
    r"""
    .. function:: regexprfindall(pattern,text)

    This function returns *all* matches of *pattern* in text, encoded with
    ``jopts.tojstrict``. The text argument is converted to a unicode string
    before matching.

    Examples:

    >>> sql("select regexprfindall('\w+', 'one')")
    regexprfindall('\w+', 'one')
    ----------------------------
    ["one"]

    >>> sql("select regexprfindall('\w+', 'one two three')")
    regexprfindall('\w+', 'one two three')
    --------------------------------------
    ["one","two","three"]
    """

    if len(args) != 2:
        raise functions.OperatorError('regexprfindall', 'Two parameters should be provided')

    # type(u'') is ``unicode`` on Python 2 and ``str`` on Python 3.
    subject = type(u'')(args[1])
    return jopts.tojstrict(re.findall(args[0], subject, re.UNICODE))

regexprfindall.registered=True
448

    
449
def regexprmatches(*args):

    """
    .. function:: regexprmatches(pattern, arg)

    This function returns true if the pattern matches arg or false otherwise.
    The pattern may match anywhere inside *arg*, which is converted to a
    unicode string first.

    Examples use `inversion`.

    Examples:

    >>> sql("regexprmatches '(a)' 'qwer a qwer'  ")
    regexprmatches('(a)','qwer a qwer')
    -----------------------------------
    1

    """
    if len(args) != 2:
        raise functions.OperatorError('regexprmatches', 'Two parameters should be provided')

    # type(u'') is ``unicode`` on Python 2 and ``str`` on Python 3.
    subject = type(u'')(args[1])
    return re.search(args[0], subject, re.UNICODE) is not None

regexprmatches.registered=True
476

    
477

    
478
def regexpcountwithpositions(pattern,expression,start = 0,min = 0.5,multiply = 1):
    """
    .. function:: regexpcountwithpositions(pattern, expression,start = 0,min = 0.5,multiply = 1)

        Returns a score of positioned matches of pattern in expression.

    The expression is scanned for matches of *pattern* and for single
    whitespace characters; each of them advances a position counter. When
    nothing matches *pattern* the score is 0.0. Otherwise:

    - if *start* equals 0, the score is
      ``min + sum(position * multiply) / number_of_positions``;
    - else the score is
      ``min + sum(number_of_positions - position) / number_of_positions``.

    Examples:

    >>> sql("regexpcountwithpositions 'start' 'start end start'  ")
    regexpcountwithpositions('start','start end start')
    ---------------------------------------------------
    1.75

    >>> sql("regexpcountwithpositions 'start' 'start end start'  ")
    regexpcountwithpositions('start','start end start')
    ---------------------------------------------------
    1.75

    >>> sql("regexpcountwithpositions 'first' 'first second third fourth'")
    regexpcountwithpositions('first','first second third fourth')
    -------------------------------------------------------------
    0.75

    >>> sql("regexpcountwithpositions 'fourth' 'first second third fourth'")
    regexpcountwithpositions('fourth','first second third fourth')
    --------------------------------------------------------------
    1.5

    >>> sql("regexpcountwithpositions 'fourth' 'first second third fourth' 1")
    regexpcountwithpositions('fourth','first second third fourth','1')
    ------------------------------------------------------------------
    0.5
    """

    scanner = re.compile(pattern + r'|(\s)', re.UNICODE)
    # The whitespace group appended above is always the last group.
    separator_group = scanner.groups

    count = 0
    total = 0
    positions = []
    for match in scanner.finditer(expression):
        count += 1
        # A separator is any match of the appended whitespace group (tabs and
        # newlines used to be mistaken for pattern matches) or, as before, a
        # match whose text is a single space.
        if match.group(separator_group) is not None or match.group() == ' ':
            continue
        positions.append(count)
        total += count * multiply

    if total == 0:
        return 0.0

    # total != 0 implies at least one position, so count is never 0 here.
    if start == 0:
        return min + total / float(count)
    return min + sum(count - position for position in positions) / float(count)

regexpcountwithpositions.registered=True
541

    
542

    
543
def regexpcountuniquematches(*args):
    """
    .. function:: regexpcountuniquematches(pattern, expression)

        Returns the number of distinct matches of pattern in expression.

    The expression is converted to a unicode string before matching.

    Examples:

    >>> sql("regexpcountuniquematches 'start' 'start end start'  ")
    regexpcountuniquematches('start','start end start')
    ---------------------------------------------------
    1

    >>> sql("regexpcountuniquematches 'start end' 'start end start'  ")
    regexpcountuniquematches('start end','start end start')
    -------------------------------------------------------
    1

    """

    # type(u'') is ``unicode`` on Python 2 and ``str`` on Python 3.
    subject = type(u'')(args[1])
    return len(set(re.findall(args[0], subject, re.UNICODE)))

regexpcountuniquematches.registered=True
568

    
569

    
570
def regexpcountwords(*args):
    """
    .. function:: regexpcountwords(pattern, expression)

        Returns the number of matches of pattern in expression. If a match includes more than one words then it returns the number of the words.

    The number of words of a match is the number of spaces in the stripped
    match plus one. The expression is converted to a unicode string before
    matching.

    Examples:

    >>> sql("regexpcountwords 'start' 'start end start'  ")
    regexpcountwords('start','start end start')
    -------------------------------------------
    2

    >>> sql("regexpcountwords 'start end' 'start end start'  ")
    regexpcountwords('start end','start end start')
    -----------------------------------------------
    2
    """

    # type(u'') is ``unicode`` on Python 2 and ``str`` on Python 3.
    subject = type(u'')(args[1])
    return sum(
        match.group().strip().count(' ') + 1
        for match in re.finditer(args[0], subject, re.UNICODE)
    )

regexpcountwords.registered=True
592

    
593

    
594
def contains(*args):
    """
    .. function:: contains(str1,str2) -> bool

    Returns true if string *str1* contains *str2*.

    Raises an OperatorError unless exactly two arguments are given.

    Examples:

    >>> sql("select contains('test string', 'str') as test  ")
    test
    ----
    1
    >>> sql("select contains('test string', 'nostr') as test  ")
    test
    ----
    0
    """
    if len(args) != 2:
        # Report the error under this operator's own name (it used to be
        # reported as "included").
        raise functions.OperatorError("contains","operator takes exactly two arguments")
    return args[1] in args[0]

contains.registered=True
618

    
619

    
620
def unitosuni(*args):
    """
    .. function:: unitosuni(str)

    Returns *str* replacing non-ascii characters with their equivalent
    unicode code point literal at the \\u00 format.

    Backslashes and control characters are escaped as well, so that the
    result can be turned back with :func:`sunitouni`. A null argument yields
    null; if the conversion fails the argument is returned unchanged.

    Examples:

    >>> sql("select unitosuni('brûlé') as test  ")
    test
    ---------------
    br\\u00fbl\\u00e9
    >>> sql("select unitosuni(null)")
    unitosuni(null)
    ---------------
    None
    >>> sql("select unitosuni(9)")
    unitosuni(9)
    ------------
    9
    """
    if len(args) != 1:
        raise functions.OperatorError("unitosuni","operator takes only one arguments")
    if args[0] is None:
        return None
    try:
        # type(u'') is ``unicode`` on Python 2 and ``str`` on Python 3.
        text = type(u'')(args[0])
        escaped = []
        # Escape character by character: a global replace of '\\x' on the
        # escaped text would also hit a literal backslash followed by 'x'.
        for ch in text:
            code = ch.encode('unicode_escape').decode('ascii')
            if code.startswith('\\x'):
                code = '\\u00' + code[2:]
            escaped.append(code)
        return ''.join(escaped)
    except Exception:
        # Best effort: hand back the input when it cannot be converted.
        return args[0]

unitosuni.registered=True
654

    
655

    
656
def sunitouni(*args):
    """
    .. function:: sunitouni(str)

    Returns *str* replacing literal unicode code points to their string representation.

    A null argument yields null. If the text cannot be interpreted as the
    body of a string literal, the argument is returned unchanged.

    Examples:

    >>> sql("select sunitouni('br\\u00fbl\\u00e9') as test  ")
    test
    -------
    brûlé
    >>> sql("select sunitouni('\\u that is not a unicode code point') as test  ")
    test
    -----------------------------------
    \\u that is not a unicode code point
    >>> sql("select sunitouni(null)")
    sunitouni(null)
    ---------------
    None
    >>> sql("select sunitouni(9)")
    sunitouni(9)
    ------------
    9
    """
    import ast

    if len(args) != 1:
        raise functions.OperatorError("sunitouni","operator takes only one arguments")
    if args[0] is None:
        return None
    # type(u'') is ``unicode`` on Python 2 and ``str`` on Python 3.
    literal = "u'%s'" % (type(u'')(args[0]).replace("'", "\\'"))
    try:
        # SECURITY: this used to be eval(). A value containing a backslash
        # followed by a quote closes the literal early, so eval() would run
        # arbitrary expressions taken from the data. literal_eval only
        # accepts literals and rejects anything else, which then falls
        # through to the "return unchanged" path below.
        return ast.literal_eval(literal)
    except Exception:
        return args[0]

sunitouni.registered=True
694

    
695

    
696
def stripchars(*args):
    """
    .. function:: stripchars(str[,stripchars])

    Returns *str* removing leading and trailing whitespace characters
    or *stripchars* characters if given. Works like python's
    `strip function <http://docs.python.org/library/stdtypes.html#str.strip>`_.

    A null *str* yields null. Arguments after the second one are ignored.

    Examples:

    >>> sql("select stripchars(' initial and final spaces  ') as test  ")
    test
    ------------------------
    initial and final spaces
    >>> sql("select stripchars(' <initial and final spaces>  ',' <>') as test  ")
    test
    ------------------------
    initial and final spaces
    >>> sql("select stripchars(null)")
    stripchars(null)
    ----------------
    None
    """
    if len(args) < 1:
        raise functions.OperatorError("stripchars","operator takes at least one arguments")
    if args[0] is None:
        return None
    # type(u'') is ``unicode`` on Python 2 and ``str`` on Python 3.
    text = type(u'')(args[0])
    if len(args) < 2:
        return text.strip()
    return text.strip(args[1])
stripchars.registered=True
728

    
729

    
730
def reencode(*args):
    """
    .. function:: reencode(str)

    Tries to repair text whose UTF-8 bytes were wrongly decoded with a
    single-byte codec: the text is encoded back with ISO-8859-1 (and, if
    that fails, with Windows-1252) and the resulting bytes are decoded as
    UTF-8. When neither attempt succeeds the text is returned as is.
    A null argument yields null.
    """
    if len(args) != 1:
        raise functions.OperatorError("reencode","operator takes only one arguments")

    if args[0] is None:
        return None
    # type(u'') is ``unicode`` on Python 2 and ``str`` on Python 3.
    text = type(u'')(args[0])
    for codec in ('iso-8859-1', 'windows-1252'):
        try:
            return text.encode(codec).decode('utf-8')
        except UnicodeError:
            # Not representable in this codec, or not valid UTF-8 afterwards.
            continue
    return text

reencode.registered=False
751

    
752

    
753
def normuni(*args):
    """
    .. function:: normuni(str)

    Returns *str* normalised in the composed unicode normal form without replacing
    same look characters. For example this 'À' character can be encoded with one or two
    different characters, :func:`normuni` returns an one-character encoded version. This
    function is important to check true strings equality.

    A null argument yields null. Values that are not unicode strings (e.g.
    numbers) are converted to text first.

    Functions :func:`sunitouni` and :func:`unitosuni` are used in the examples to make it more comprehensive.

    Examples:

    .. note::
        Returned results in the next two examples should look the same,
        if not that is a bug at the combined characters rendering of the shell
        that the documentation was created.

    >>> sql("select sunitouni('C\\u0327') as test  ")
    test
    ----
    Ç
    >>> sql("select normuni(sunitouni('C\\u0327')) as test  ")
    test
    ----
    Ç
    >>> sql("select unitosuni(normuni(sunitouni('C\\u0327'))) as test  ")
    test
    ------
    \\u00c7
    """
    if len(args) != 1:
        raise functions.OperatorError("normuni","operator takes only one arguments")
    value = args[0]
    if value is None:
        return None
    # type(u'') is ``unicode`` on Python 2 and ``str`` on Python 3.
    text_type = type(u'')
    if not isinstance(value, text_type):
        # unicodedata.normalize only accepts unicode strings.
        value = text_type(value)
    return unicodedata.normalize('NFC', value)

normuni.registered=True
791

    
792

    
793
def hashmd5(*args):
    """
    .. function:: hashmd5(args)

    Returns an MD5 hash of args. Numbers are converted to text before hashing is
    performed.

    The hashed text is the ``repr`` of every argument, joined with the
    character with code 30 when there is more than one argument.

    Examples:

    >>> sql("select hashmd5(65)")
    hashmd5(65)
    --------------------------------
    fc490ca45c00b1249bbe3554a4fdf6fb
    >>> sql("select hashmd5(6,5)")
    hashmd5(6,5)
    --------------------------------
    f0d95c20cde50e3ca03cab53f986b6c3
    >>> sql("select hashmd5(5)")
    hashmd5(5)
    --------------------------------
    e4da3b7fbbce2345d7772b0674a318d5
    >>> sql("select hashmd5('5')")
    hashmd5('5')
    --------------------------------
    7000aaf68ca7a93da0af3d03850571c2
    """

    # Joining a single repr yields that repr unchanged, so one code path
    # covers both the single- and the multi-argument case.
    payload = chr(30).join([repr(x) for x in args])
    if not isinstance(payload, bytes):
        # On Python 3 repr() returns text while hashlib requires bytes; on
        # Python 2 the payload already is a byte string and is left alone.
        payload = payload.encode('utf-8')
    return hashlib.md5(payload).hexdigest()

hashmd5.registered=True
826

    
827

    
828
def hashmd5mod(*args):
    """
    .. function:: hashmd5mod(args, divisor) -> int

    Returns the *modulo* with divisor number of the MD5 hash of args.
    Numbers are converted to text before hashing is performed.

    The divisor is the last argument.

    Examples:

    >>> sql("select hashmd5mod(65, 3)")
    hashmd5mod(65, 3)
    -----------------
    0

    >>> sql("select hashmd5mod(6,5, 4)")
    hashmd5mod(6,5, 4)
    ------------------
    2

    >>> sql("select hashmd5mod(5, 5)")
    hashmd5mod(5, 5)
    ----------------
    3

    >>> sql("select hashmd5mod('5', 5)")
    hashmd5mod('5', 5)
    ------------------
    4
    """

    if len(args) == 2:
        payload = repr(args[0])
    else:
        # NOTE(review): with more than two arguments the divisor itself is
        # part of the hashed text (all args are joined, not args[:-1]). This
        # looks unintended, but changing it would change every value that
        # was produced so far, so it is kept as is -- confirm before fixing.
        payload = chr(30).join([repr(x) for x in args])
    if not isinstance(payload, bytes):
        # On Python 3 repr() returns text while hashlib requires bytes; on
        # Python 2 the payload already is a byte string and is left alone.
        payload = payload.encode('utf-8')
    return int(hashlib.md5(payload).hexdigest(), 16) % args[-1]

hashmd5mod.registered=True
864

    
865

    
866
def crc32(*args):
    """
    .. function:: crc32(args) -> int

    Returns the CRC32 of args. Numbers are converted to text before hashing is
    performed.

    The checksummed text is the ``repr`` of every argument, joined with the
    character with code 30 when there is more than one argument. The result
    is masked to an unsigned 32 bit value.

    Examples:

    >>> sql("select crc32(65)")
    crc32(65)
    ----------
    2658551721

    >>> sql("select crc32(6,5)")
    crc32(6,5)
    ----------
    1565899724

    >>> sql("select crc32(5)")
    crc32(5)
    ----------
    2226203566

    >>> sql("select crc32('5')")
    crc32('5')
    ----------
    1201448970
    """

    # Joining a single repr yields that repr unchanged, so one code path
    # covers both the single- and the multi-argument case.
    payload = chr(30).join([repr(x) for x in args])
    if not isinstance(payload, bytes):
        # On Python 3 repr() returns text while zlib requires bytes; on
        # Python 2 the payload already is a byte string and is left alone.
        payload = payload.encode('utf-8')
    return zlib.crc32(payload) & 0xffffffff

crc32.registered=True
902

    
903
def hashmodarchdep(*args):
    """
    .. function:: hashmodarchdep(args, divisor) -> int

    Returns a hash of the args: python's built-in ``hash`` of the tuple of
    all arguments but the last, modulo the last argument (the divisor).

    .. note::

        This hash function is architecture dependent (32bit vs 64bit).

    Examples:

    >>> sql("select hashmodarchdep(65,5)") #doctest:+ELLIPSIS
    hashmodarchdep(65,5)
    --------------------
    ...

    >>> sql("select hashmodarchdep(6,5)") #doctest:+ELLIPSIS
    hashmodarchdep(6,5)
    -------------------
    ...

    >>> sql("select hashmodarchdep(5,5)") #doctest:+ELLIPSIS
    hashmodarchdep(5,5)
    -------------------
    ...

    >>> sql("select hashmodarchdep('5',5)") #doctest:+ELLIPSIS
    hashmodarchdep('5',5)
    ---------------------
    ...
    """

    divisor = args[-1]
    values = tuple(args[:-1])
    return hash(values) % divisor

hashmodarchdep.registered=True
939

    
940

    
941
def textreferences(txt,maxlen = 5,pattern = r'(\b|_)((1[5-9]\d{2,2})|(20\d{2,2}))(\b|_)' ):
    """
    .. function:: textreferences(text, maxlen = 5, pattern = (\\b|_)((1[5-9]\\d{2,2})|(20\\d{2,2}))(\\b|_))

    Returns the "Reference" section of documents. To find it, it searches for parts of the document that
    have a high density of pattern matches.

    .. parameters:: txt,maxlen,pattern
       txt: input text.
       maxlen: the size of the scrolling window over the text in which the density is calculated.
       pattern: regular expression that is matched against the lines of the text. By default the pattern matches
                year occurences so as to extract sections that look like references.

    Only lines longer than 10 characters take part. The lines are scanned
    from the last one to the first one; the density of a line is the number
    of matching lines among the last *maxlen* scanned lines divided by
    *maxlen*. Lines whose density is at least the average density are
    returned, in their original order.

    Examples:

    >>> table1('''
    ... eeeeeeeeeeeeee
    ... gggggggggggggg
    ... aaaaaaaaaaaaaa
    ... bbbbbbbbbbbbbb
    ... aaa_1914_ccccc
    ... bbb_2014_bbbbb
    ... dddd_2008_ddddddd
    ... cccc_2005_ccccc
    ... ccccc_2014_ccccc
    ... dddddd_2009_ddddd
    ... gggggggggggggg
    ... ''')

    >>> sql("select textreferences(group_concat(a,'\\n'),1,'(\\b|_)(1|2)\\d{3,3}(\\b|_)') as a from table1")
    a
    --------------------------------------------------------------------------------------------------
    aaa_1914_ccccc
    bbb_2014_bbbbb
    dddd_2008_ddddddd
    cccc_2005_ccccc
    ccccc_2014_ccccc
    dddddd_2009_ddddd


    If an inadequate amount of newlines is found, it returns the text as is.

    >>> sql("select textreferences(group_concat(a,'.')) from table1")
    textreferences(group_concat(a,'.'))
    -----------------------------------------------------------------------------------------------------------------------------------------------------------------------------
    eeeeeeeeeeeeee.gggggggggggggg.aaaaaaaaaaaaaa.bbbbbbbbbbbbbb.aaa_1914_ccccc.bbb_2014_bbbbb.dddd_2008_ddddddd.cccc_2005_ccccc.ccccc_2014_ccccc.dddddd_2009_ddddd.gggggggggggggg


    >>> sql("select textreferences('')")
    textreferences('')
    ------------------
    <BLANKLINE>
    """

    text = txt.replace('\r\n', '\n')

    # Fewer than 10 newlines: the text is returned (normalised) as is.
    if text.count('\n') < 10:
        return text

    # Bottom-up list of the lines that are long enough to be considered.
    candidates = [line for line in reversed(text.split('\n')) if len(line) > 10]
    if not candidates:
        return ''

    # The window size may arrive as a string when called from SQL.
    maxlen = int(maxlen)
    matcher = re.compile(pattern)

    window = deque()    # match flags (0/1) of the last *maxlen* lines
    hits = 0            # number of 1-flags currently inside the window
    densities = []
    for line in candidates:
        flag = 1 if matcher.search(line) else 0
        if len(window) == maxlen:
            hits -= window.popleft()
        window.append(flag)
        hits += flag
        densities.append(float(hits) / maxlen)

    threshold = sum(densities) / len(densities)

    kept = [line for line, density in zip(candidates, densities)
            if density >= threshold]
    kept.reverse()
    return '\n'.join(kept)

textreferences.registered=True
1045

    
1046

    
1047
def textwindow(*args):
    r"""
    .. function:: textwindow(text, previous_word_count = 0, next_word_count = 0, middle_word_count = 1, pattern = None)

    Returns a rolling window over the space-separated words of *text*. Each
    output row holds *previous_word_count* words before the middle, the middle
    itself (*middle_word_count* words joined with a single space) and
    *next_word_count* words after it. Positions that fall outside the text are
    returned as empty strings.

    The fourth and fifth arguments are told apart by their type: an integer is
    taken as *middle_word_count*, anything else as a regular expression
    *pattern*. When a pattern is given, only the windows whose middle matches
    it are returned.

    If *previous_word_count* is negative and a pattern is given, a middle is
    returned only when its first word matches the pattern, and the previous
    columns hold the last abs(previous_word_count) words that did not match
    (i.e. matching words are filtered out of the previous columns). Without a
    pattern, a negative *previous_word_count* is treated as its absolute value.

    Examples:

    >>> sql("select textwindow('This is a test phrase')  ")
    middle
    ------
    This
    is
    a
    test
    phrase

    >>> sql("select textwindow('This is a test phrase',1,1)  ")
    prev1 | middle | next1
    -----------------------
          | This   | is
    This  | is     | a
    is    | a      | test
    a     | test   | phrase
    test  | phrase |

    >>> sql("select textwindow('This is a test phrase',1,1,2)  ")
    prev1 | middle      | next1
    ----------------------------
          | This is     | a
    This  | is a        | test
    is    | a test      | phrase
    a     | test phrase |
    test  | phrase      |

    >>> sql("select textwindow('This is a test phrase  with pdb codes: 1abc 2bcd 3cde 4bde ',-2,1,2,'\d\w{3}' )  ")
    prev1 | prev2  | middle    | next1
    ----------------------------------
    pdb   | codes: | 1abc 2bcd | 3cde
    pdb   | codes: | 2bcd 3cde | 4bde
    pdb   | codes: | 3cde 4bde |
    pdb   | codes: | 4bde      |

    >>> sql("select textwindow('This is a test phrase (123) for filtering middle with a number',1,1,'\d+')  ")
    prev1  | middle | next1
    -----------------------
    phrase | (123)  | for

    """
    text = args[0]
    prev = args[1] if len(args) > 1 else 0
    nextlen = args[2] if len(args) > 2 else 0

    # The two optional trailing arguments may come in either order; an
    # integer is the middle word count, anything else is the pattern.
    middle = 1
    pattern = None
    for extra in args[3:5]:
        if isinstance(extra, int):
            middle = extra
        else:
            pattern = extra

    # A negative prev only has a meaning together with a pattern.
    if pattern is None:
        prev = abs(prev)

    # First row is the schema: prev1..prevN, middle, next1..nextM.
    # range (not xrange) keeps the block valid on Python 3 as well.
    yield tuple(itertools.chain(
        ('prev' + str(x) for x in range(1, abs(prev) + 1)),
        ('middle',),
        ('next' + str(y) for y in range(1, nextlen + 1))))

    # Compile once instead of once per branch.
    patt = None if pattern is None else re.compile(pattern, re.UNICODE)

    # Pad with empty words so that the first/last windows are full width.
    words = ([''] * max(prev, 0)
             + text.split(' ')
             + [''] * ((middle - 1) + nextlen))

    if prev >= 0:
        window = prev + middle + nextlen
        pm = prev + middle
        for i in range(len(words) - window + 1):
            # For middle == 1 the join is just the single middle word.
            mid = ' '.join(words[i + prev:pm + i])
            if patt is None or patt.search(mid):
                yield words[i:i + prev] + [mid] + words[i + pm:i + window]
    else:
        # Negative prev (a pattern is always present here): keep a bounded
        # history of the most recent non-matching words as the prev columns.
        history_len = -prev
        window = middle + nextlen
        winprev = deque([''] * history_len, history_len)
        for i in range(len(words) - window + 1):
            # Only the first word of the middle is tested against the pattern.
            if patt.search(words[i]):
                mid = ' '.join(words[i:i + middle])
                yield tuple(itertools.chain(
                    winprev, [mid], words[i + middle:i + window]))
            else:
                winprev.append(words[i])

textwindow.registered=True
1180

    
1181

    
1182
def textwindow2s(*args):
    r"""
    .. function:: textwindow2s(text, prev_word_count, middle_word_count, next_word_count, pattern)

    Returns a rolling window over the space-separated words of *text*. Every
    row has exactly three columns: up to *prev_word_count* words before the
    middle, *middle_word_count* middle words and up to *next_word_count* words
    after it, each group joined with a single space.

    Defaults are 0 previous words, 1 middle word and 0 next words. If a
    *pattern* is given, only the windows whose middle matches it are returned.

    Examples:


    >>> sql("select textwindow2s('This is a test phrase',2,1,1)  ")
    prev    | middle | next
    -------------------------
            | This   | is
    This    | is     | a
    This is | a      | test
    is a    | test   | phrase
    a test  | phrase |

    >>> sql("select textwindow2s('This is a test phrase',2,1,1, '\w{4}')  ")
    prev   | middle | next
    ------------------------
           | This   | is
    is a   | test   | phrase
    a test | phrase |

    """
    g = args[0].split(' ')
    yield ('prev', 'middle', 'next')

    def int_arg(position, default, message):
        # Missing arguments fall back to their default; present ones must be
        # convertible to int, otherwise the operator error is raised.
        if len(args) <= position:
            return default
        try:
            return int(args[position])
        except (TypeError, ValueError, OverflowError):
            raise functions.OperatorError('textwindow2s', message)

    # The existing messages number the arguments after the text argument
    # (next_word_count is "Third"), so the new ones follow the same scheme.
    prev = int_arg(1, 0, 'First argument should be an integer')
    middle = int_arg(2, 1, 'Second argument should be an integer')
    nextlen = int_arg(3, 0, 'Third argument should be an integer')

    patt = None
    if len(args) > 4:
        raw = args[4]
        try:
            if hasattr(raw, 'search'):
                # Already a compiled pattern: flags cannot be added to it.
                patt = raw
            else:
                # re.UNICODE so \w, \d, \s, \b also cover non-ASCII text,
                # as the other patterns in this file are compiled.
                patt = re.compile(raw, re.UNICODE)
        except Exception:
            raise functions.OperatorError('textwindow2s','Fourth argument must be string or compiled pattern')

    # range (not xrange) keeps the block valid on Python 3 as well.
    for i in range(len(g) - middle + 1):
        im = i + middle
        mid = ' '.join(g[i:im])
        if patt is None or patt.search(mid):
            yield (' '.join(g[max(i - prev, 0):i]),
                   mid,
                   ' '.join(g[im:im + nextlen]))

textwindow2s.registered=True
1247

    
1248

    
1249
# Self-test footer: runs only when the module's name has no package prefix
# (i.e. it was not imported as part of a package). It is needed to be able to
# test the functions; keep it at the end of every new function module.
if '.' not in __name__:
    import setpath
    import sys
    from functions import *
    testfunction()
    if "__main__" == __name__:
        import doctest
        # Python 2 only: make utf-8 the default encoding before the doctests
        # run; reload(sys) is called first to get setdefaultencoding back.
        reload(sys)
        sys.setdefaultencoding('utf-8')
        doctest.testmod()
(19-19/22)