Revision 28793
Added by Dominika Tkaczyk almost 10 years ago
text.py | ||
---|---|---|
350 | 350 |
|
351 | 351 |
def cleanchar(c): |
352 | 352 |
c=c.group()[0] |
353 |
if unicodedata.category(c)[0]=='C':
|
|
353 |
if c != '\n' and unicodedata.category(c)[0] == 'C':
|
|
354 | 354 |
return u'' |
355 | 355 |
else: |
356 | 356 |
return c |
... | ... | |
360 | 360 |
if type(i) in (str,unicode): |
361 | 361 |
o+=characters_to_clean.sub(cleanchar, i) |
362 | 362 |
else: |
363 |
o+=unicode(append(i))
|
|
363 |
o+=unicode(i, errors='replace')
|
|
364 | 364 |
|
365 | 365 |
return o |
366 | 366 |
|
... | ... | |
412 | 412 |
else: |
413 | 413 |
return None |
414 | 414 |
|
415 |
if len(args)==3: |
|
416 |
return re.sub(args[0], args[2], args[1]) |
|
415 |
if len(args) == 3: |
|
416 |
try: |
|
417 |
return re.sub(args[0], args[2], args[1], flags=re.UNICODE) |
|
418 |
except TypeError: |
|
419 |
return re.sub(args[0], args[2], args[1]) |
|
417 | 420 |
|
418 |
regexpr.registered=True
|
|
421 |
regexpr.registered = True
|
|
419 | 422 |
|
420 | 423 |
def regexprfindall(*args): |
421 | 424 |
""" |
... | ... | |
935 | 938 |
hashmodarchdep.registered=True |
936 | 939 |
|
937 | 940 |
|
938 |
def textreferences(txt,maxlen = 5,pattern = r'(\b|_)(1|2)\d{3,3}(\b|_)' ):
|
|
941 |
def textreferences(txt,maxlen = 5,pattern = r'(\b|_)((1[5-9]\d{2,2})|(20\d{2,2}))(\b|_)' ):
|
|
939 | 942 |
""" |
940 | 943 |
.. function:: textreferences(text, maxlen = 5, pattern = (\b|_)(1|2)\d{3,3}(\b|_)) |
941 | 944 |
|
... | ... | |
1030 | 1033 |
except: |
1031 | 1034 |
threshold = 0 |
1032 | 1035 |
|
1033 |
winlen = 0 |
|
1034 |
win = deque(('' for _ in xrange(maxlen)),maxlen) |
|
1035 | 1036 |
current = 0 |
1036 |
start = 0 |
|
1037 | 1037 |
for i in reversedtext2: |
1038 | 1038 |
if len(i)>10: |
1039 |
if winlen == maxlen and densities[current]>=threshold: |
|
1040 |
if start == 1: |
|
1041 |
start = 0 |
|
1042 |
for j in xrange(maxlen/2): |
|
1043 |
references.append(win[j]) |
|
1044 |
|
|
1045 |
references.append(win[maxlen/2]) |
|
1046 |
win.append(i) |
|
1047 |
if winlen<maxlen: |
|
1048 |
winlen+=1 |
|
1049 |
if winlen == maxlen: |
|
1050 |
start = 1 |
|
1051 |
else : |
|
1052 |
current+=1 |
|
1053 |
|
|
1054 |
|
|
1039 |
if densities[current] >= threshold: |
|
1040 |
references.append(i) |
|
1041 |
current+=1 |
|
1055 | 1042 |
return '\n'.join(reversed(references)) |
1056 | 1043 |
|
1057 | 1044 |
textreferences.registered=True |
... | ... | |
1234 | 1221 |
|
1235 | 1222 |
try: |
1236 | 1223 |
nextlen = args[3] |
1224 |
try: |
|
1225 |
nextlen = int(nextlen) |
|
1226 |
except: |
|
1227 |
raise functions.OperatorError('textwindow2s','Third argument should be an integer') |
|
1237 | 1228 |
except IndexError: |
1238 | 1229 |
nextlen = 0 |
1239 | 1230 |
|
1240 | 1231 |
if len(args) > 4: |
1241 |
patt = re.compile(args[4]) |
|
1232 |
try: |
|
1233 |
patt = re.compile(args[4]) |
|
1234 |
except: |
|
1235 |
raise functions.OperatorError('textwindow2s','Fourth argument must be string or compiled pattern') |
|
1242 | 1236 |
for i in xrange(len(g)-middle+1): |
1243 | 1237 |
im = i+middle |
1244 | 1238 |
mid = ' '.join(g[i:im]) |
Also available in: Unified diff
Madis update