Project

General

Profile

1
"""
2
Examples:
3

    
4
    >>> table1('''
5
    ... James   10	2
6
    ... Mark    7	3
7
    ... Lila    74	1
8
    ... ''')
9
    >>> sql("select * from table1")
10
    a     | b  | c
11
    --------------
12
    James | 10 | 2
13
    Mark  | 7  | 3
14
    Lila  | 74 | 1
15

    
16
    >>> sql("coutput '../../tests/output' split:2  mode:rcfile select hashmodarchdep(rank,2),* from (select a as name , b as age, c as rank from table1)")
17
    return_value
18
    ------------
19
    1
20
    >>> sql("unionallrcfiles file:../../tests/output")
21
    name  | age | rank
22
    ------------------
23
    Mark  | 7   | 3
24
    Lila  | 74  | 1
25
    James | 10  | 2
26

    
27
    >>> sql("coutput '../../tests/emptyoutput' split:2  mode:rcfile select hashmodarchdep(rank,2),* from (select a as name , b as age, c as rank from table1 limit 0)")
28
    return_value
29
    ------------
30
    1
31
    >>> sql("unionallrcfiles file:../../tests/emptyoutput")
32

    
33

    
34
    >>> sql("coutput '../../tests/outputsp8' split:8  mode:rcfile select hashmodarchdep(rank,8),* from (select a as name , b as age, c as rank from table1)")
35
    return_value
36
    ------------
37
    1
38
    >>> sql("unionallrcfiles file:../../tests/outputsp8")
39
    name  | age | rank
40
    ------------------
41
    James | 10  | 2
42
    Mark  | 7   | 3
43
    Lila  | 74  | 1
44
"""
45

    
46

    
47
import os.path
48
import sys
49
from vtout import SourceNtoOne
50
import functions
51
import lib.inoutparsing
52
import os
53
import marshal
54
from itertools import izip , imap
55
import itertools
56
import cPickle as cPickle
57
import struct
58
import gc
59
import cStringIO as cStringIO
60
import marshal
61
import zlib
62
from collections import OrderedDict
63
from array import array
64
import time
65
import listser
66

    
67

    
68

    
69
BLOCK_SIZE = 65536000
70
ZLIB = "zlib"
71
BZ2 = "bzip"
72
RCFILE=1
73
SDC=2
74
registered=True
75

    
76
def getSize(v):
77
    t = type(v)
78

    
79
    if t == unicode:
80
        return 52 + 4*len(v)
81

    
82
    if t in (int, float, None):
83
        return 24
84

    
85
    return 37 + len(v)
86

    
87
def outputData(diter, schema, connection, *args, **formatArgs):
88
    ### Parameter handling ###
89

    
90
    where=None
91
    mode = 'row'
92
    compression = 'zlib'
93
    level = 5
94
    if len(args)>0:
95
        where=args[0]
96
    elif 'file' in formatArgs:
97
        where=formatArgs['file']
98
    else:
99
        raise functions.OperatorError(__name__.rsplit('.')[-1],"No destination provided")
100
    if 'file' in formatArgs:
101
        del formatArgs['file']
102
    if 'mode' in formatArgs:
103
        mode = formatArgs['mode']
104
    if 'compr'  in formatArgs:
105
        if formatArgs['compr'] == "zlib":
106
            compression = ZLIB
107
        elif formatArgs['compr'] == "bz2":
108
            compression = BZ2
109
    if 'level' in formatArgs:
110
        l = formatArgs['level']
111
        if int(l)>=0 and int(l) <=9 :
112
            level = int(l)
113
    filename, ext=os.path.splitext(os.path.basename(where))
114
    fullpath=os.path.split(where)[0]
115
    if 'split' not in formatArgs:
116
        fileIter=open(where, "w+b")
117
        fastPickler = cPickle.Pickler(fileIter, 1)
118
        fastPickler.fast = 1
119
    else:
120
        fileIter = 1
121

    
122

    
123
    if mode == 'simplecol':
124
        pass
125

    
126

    
127

    
128

    
129

    
130
    if mode == 'spaconedict':
131
        colnum = len(schema)
132
        marshal.dump(schema,fileIter,2)
133
        setcol = set([])
134
        start = 0
135
        step = 65535 / colnum
136
        bsize = 0
137
        numrows = 0
138
        dictlimit = step - colnum
139
        current = [[] for _ in xrange(colnum)]
140
        prevdicts = {}
141
        count = 0
142
        while True:
143
            maxlen = 0
144
            nditer = zip(*itertools.islice(diter, 0, step))
145
            if len(nditer) != 0:
146
                count += len(nditer[0])
147
            for i,col in enumerate(nditer):
148
                current[i] += col
149
                setcol.update(col)
150
                l = len(setcol)
151
                if l > maxlen:
152
                    maxlen = l
153
            step = (65535 - maxlen) / colnum
154
            if step < 5000 or len(nditer) == 0:
155
                prev = fileIter.tell()
156
                headindex = [0 for _ in xrange(colnum+2)]
157
                fileIter.write(struct.pack('L'*len(headindex), *headindex))
158
                headindex[i] = fileIter.tell()
159
                s = sorted(setcol)
160
                finallist =[]
161
                for val in s:
162
                    if val not in prevdicts:
163
                        finallist.append(val)
164
                prevdicts = dict(((x,y) for y,x in enumerate(s)))
165
                marshal.dump(list(finallist),fileIter,2)
166
                cur = zip(*current)
167
                for i in xrange(count):
168
                    array('H',[prevdicts[y] for y in cur[i]]).tofile(fileIter)
169
                headindex[i+1] = fileIter.tell()
170
                headindex[i+2] = count
171
                count=0
172
                fileIter.seek(prev)
173
                fileIter.write(struct.pack('L'*len(headindex), *headindex))
174
                fileIter.seek(headindex[colnum])
175
                current = [[] for _ in xrange(colnum)]
176
                setcol = set([])
177
                gc.collect()
178
                step = 65535 / colnum
179
            if len(nditer)==0:
180
                break
181

    
182

    
183

    
184
    if mode == 'sortedspac':
185
        colnum = len(schema)
186
        marshal.dump(schema,fileIter,2)
187
        setcol = [set([]) for _ in xrange(colnum)]
188
#        compressorlistcols = [zlib.BZ2Compressor(9) for _ in xrange(colnum)]
189
#        compressorlistdicts = [zlib.BZ2Compressor(9) for _ in xrange(colnum)]
190
#        compressorlistvals = [zlib.BZ2Compressor(9) for _ in xrange(colnum)]
191
        listoflens = [0 for _ in xrange(colnum)]
192
        start = 0
193
        dictsize = 65536
194
        listofarrays = []
195
        paxcols = []
196
        indextype = 'H'
197
        step = dictsize
198
        stopstep = int(dictsize/20)
199
        bsize = 0
200
        listofnditers = []
201
        numrows=0
202
        dictlimit = step - colnum
203
        coldicts = [{} for _ in xrange(colnum)]
204
        prevsets =  [set([]) for _ in xrange(colnum)]
205
        count = 0
206
        blocknum = 0
207
        bsize = 0
208
        while True:
209
            maxlen = 0
210
            ret = False
211
            z = zip(*itertools.islice(diter, 0, step))
212

    
213
            if z==[]:
214
                ret = True
215
            else:
216
                for c in z:
217
                    bsize+=sys.getsizeof(c)
218
                listofnditers.append(z)
219

    
220
                if len(listofnditers[-1])!=0:
221
                    count += len(listofnditers[-1][0])
222
                else:
223
                    ret = 1
224

    
225
                for i,col in enumerate(listofnditers[-1]):
226
                    if i not in paxcols:
227
                        setcol[i].update(col)
228
                        l = len(setcol[i])
229
                        if l > maxlen:
230
                            maxlen = l
231

    
232
            step = dictsize - maxlen
233
            if step < stopstep or ret or bsize>300000000:
234
                bsize=0
235
                prev = fileIter.tell() + 4*(colnum+2)
236
                output = cStringIO.StringIO()
237
                headindex = [0 for _ in xrange(colnum+2)]
238

    
239
                if blocknum == 0:
240
                    for i in xrange(colnum):
241
                        headindex[i] = output.tell() + prev
242
#                        s =  [val for subl in [x[i] for x in listofnditers] for val in subl]
243
#                        if (dictsize*2+len(marshal.dumps(setcol[i],2))>len(marshal.dumps(s,2))):
244
#                            paxcols.append(i)
245
#                            output.write(compressorlistdicts[i].compress(marshal.dumps(s,2)))
246
                        if (len(setcol[i])>55000):
247
                            paxcols.append(i)
248
                            s =  [val for subl in [x[i] for x in listofnditers] for val in subl]
249
                            #output.write(compressorlistdicts[i].compress(marshal.dumps(s,2)))
250
                            listofarrays.append(s)
251
                        else:
252
                            prevsets[i] = set(setcol[i]).copy()
253
                            s = sorted(setcol[i])
254
                            coldicts[i] = dict(((x,y) for y,x in enumerate(s)))
255
                            coldict = coldicts[i]
256
                            if len(s)<256:
257
                                indextype='B'
258
                            else:
259
                                indextype='H'
260
                            listofarrays.append(array(indextype,[coldict[val] for subl in [x[i] for x in listofnditers] for val in subl]))
261
                            #output.write(compressorlistcols[i].compress(array(indextype,[coldict[val] for subl in [x[i] for x in listofnditers] for val in subl] ).tostring()))
262
                else:
263
                    for i in xrange(colnum):
264
                        headindex[i] = output.tell() + prev
265
                        if i in paxcols:
266
                            s =  [val for subl in [x[i] for x in listofnditers] for val in subl]
267
                            #output.write(compressorlistdicts[i].compress(marshal.dumps(s,2)))
268
                            listofarrays.append(s)
269
                        else:
270
                            difnew = list(setcol[i]-prevsets[i])
271
                            difold = list(prevsets[i]-setcol[i])
272
                            prevsets[i].intersection_update(setcol[i])
273
                            prevsets[i].update(difnew)
274
                            prevsets[i].update(difold[:(dictsize-len(prevsets[i]))])
275
                            towrite = {}
276
                            le = len(difold)
277
                            si = len(coldicts[i])
278
                            for l,j in enumerate(difnew):
279
                                if l<le:
280
                                    towrite[j] = coldicts[i][difold[l]]
281
                                else:
282
                                    towrite[j] = si
283
                                    si+=1
284
                            coldicts[i] = dict(((x,y) for y,x in enumerate(prevsets[i])))
285
                            coldict = coldicts[i]
286
                            if len(prevsets[i]) != 0 :
287
                                if len(prevsets[i])<256:
288
                                    indextype='B'
289
                                else:
290
                                    indextype='H'
291
                                output.write(zlib.compress(marshal.dumps(towrite.keys(),2)))
292
                                output.write(zlib.compress(array(indextype,towrite.values()).tostring()))
293
                            listofarrays.append(array(indextype,[coldict[val] for subl in [x[i] for x in listofnditers] for val in subl]))
294
                        #output.write(compressorlistcols[i].compress(array(indextype,[coldict[val] for subl in [x[i] for x in listofnditers] for val in subl] ).tostring()))
295
#
296
                z = zip(*listofarrays)
297
                ##edw elegxw prwta oti uparxei kapoio paxcol
298
                z.sort(key=lambda z: z[paxcols[0]])
299
                li = zip(*z)
300

    
301

    
302
#                for i,l in enumerate(listofarrays):
303
#                    if i in paxcols:
304
#                        output.write(zlib.compress(marshal.dumps(sorted(l),2)))
305
#                    else:
306
#                        output.write(zlib.compress(marshal.dumps(l)))
307

    
308
                for x in li:
309
                       output.write(zlib.compress(marshal.dumps(sorted(x),2)))
310

    
311

    
312
                headindex[i+1] = output.tell()+ prev
313
                headindex[i+2] = count
314

    
315
                count=0
316
                fileIter.write(struct.pack('L'*len(headindex), *headindex))
317
                fileIter.write(output.getvalue())
318
                listoflens = [0 for _ in xrange(colnum)]
319
                for s in setcol:
320
                    s.clear()
321
                listofarrays = []
322
                listofnditers = []
323
                gc.collect()
324
                step = dictsize
325
                blocknum+=1
326

    
327
            if ret:
328
                break
329

    
330

    
331

    
332
    if mode == 'sspac':
333
        colnum = len(schema)
334
        marshal.dump(schema,fileIter,2)
335
        setcol = [set([]) for _ in xrange(colnum)]
336
#        compressorlistcols = [zlib.BZ2Compressor(9) for _ in xrange(colnum)]
337
#        compressorlistdicts = [zlib.BZ2Compressor(9) for _ in xrange(colnum)]
338
#        compressorlistvals = [zlib.BZ2Compressor(9) for _ in xrange(colnum)]
339
        listoflens = [0 for _ in xrange(colnum)]
340
        start = 0
341
        dictsize = 65536
342
        #listofarrays = []
343
        paxcols = []
344
        indextype = 'H'
345
        step = dictsize
346
        stopstep = int(dictsize/20)
347
        bsize = 0
348
        listofnditers = []
349
        numrows=0
350
        dictlimit = step - colnum
351
        coldicts = [{} for _ in xrange(colnum)]
352
        prevsets =  [set([]) for _ in xrange(colnum)]
353
        count = 0
354
        blocknum = 0
355
        bsize = 0
356

    
357
        while True:
358
            maxlen = 0
359
            ret = False
360
            z = zip(*itertools.islice(diter, 0, step))
361

    
362
            if z==[]:
363
                ret = True
364
            else:
365
                for c in z:
366
                    bsize+=sys.getsizeof(c)
367
                listofnditers.append(z)
368

    
369
                if len(listofnditers[-1])!=0:
370
                    count += len(listofnditers[-1][0])
371
                else:
372
                    ret = 1
373

    
374
                for i,col in enumerate(listofnditers[-1]):
375
                    if i not in paxcols:
376
                        setcol[i].update(col)
377
                        l = len(setcol[i])
378
                        if l > maxlen:
379
                            maxlen = l
380

    
381
            step = dictsize - maxlen
382
            if step < stopstep or ret or bsize>300000000:
383
                bsize=0
384
                prev = fileIter.tell() + 4*(colnum+2)
385
                output = cStringIO.StringIO()
386
                headindex = [0 for _ in xrange(colnum+2)]
387
                listofvals = []
388
                for j in xrange(colnum):
389
                     listofvals.append([val for subl in [x[j] for x in listofnditers] for val in subl])
390
                listofvals1 = zip(*listofvals)
391
                listofvals1.sort(key=lambda listofvals1: listofvals1[10])
392
                listofvals = zip(*listofvals1)
393
                for s in setcol:
394
                    s.clear()
395
                for l in xrange(colnum):
396
                    setcol[l] = set(listofvals[l])
397

    
398
                print listofvals[10]
399
                if blocknum == 0:
400
                    for i in xrange(colnum):
401
                        headindex[i] = output.tell() + prev
402
#                        s =  [val for subl in [x[i] for x in listofnditers] for val in subl]
403
#                        if (dictsize*2+len(marshal.dumps(setcol[i],2))>len(marshal.dumps(s,2))):
404
#                            paxcols.append(i)
405
#                            output.write(compressorlistdicts[i].compress(marshal.dumps(s,2)))
406
                        if (len(setcol[i])>55000):
407
                            paxcols.append(i)
408
                            #s =  [val for subl in [x[i] for x in listofnditers] for val in subl]
409
                            output.write(zlib.compress(marshal.dumps(listofvals[i],2)))
410
                            #listofarrays.append(s)
411
                        else:
412
                            prevsets[i] = set(setcol[i]).copy()
413
                            s = sorted(setcol[i])
414
                            coldicts[i] = dict(((x,y) for y,x in enumerate(s)))
415
                            coldict = coldicts[i]
416
                            if len(s)<256:
417
                                indextype='B'
418
                            else:
419
                                indextype='H'
420
                            #listofarrays.append(array(indextype,[coldict[val] for subl in [x[i] for x in listofnditers] for val in subl]))
421
                            output.write(zlib.compress(marshal.dumps(s,2)))
422
                            output.write(zlib.compress(array(indextype,[coldict[val] for val in listofvals[i] ]).tostring()))
423
                else:
424

    
425
                    for i in xrange(colnum):
426
                        headindex[i] = output.tell() + prev
427
                        if i in paxcols:
428
                            #s =  [val for subl in [x[i] for x in listofnditers] for val in subl]
429
                            output.write(zlib.compress(marshal.dumps(listofvals[i],2)))
430
                            #listofarrays.append(s)
431
                        else:
432
                            difnew = list(setcol[i]-prevsets[i])
433
                            difold = list(prevsets[i]-setcol[i])
434
                            prevsets[i].intersection_update(setcol[i])
435
                            prevsets[i].update(difnew)
436
                            prevsets[i].update(difold[:(dictsize-len(prevsets[i]))])
437
                            towrite = {}
438
                            le = len(difold)
439
                            si = len(coldicts[i])
440
                            for l,j in enumerate(difnew):
441
                                if l<le:
442
                                    towrite[j] = coldicts[i][difold[l]]
443
                                else:
444
                                    towrite[j] = si
445
                                    si+=1
446
                            coldicts[i] = dict(((x,y) for y,x in enumerate(prevsets[i])))
447
                            coldict = coldicts[i]
448
                            if len(prevsets[i]) != 0 :
449
                                if len(prevsets[i])<256:
450
                                    indextype='B'
451
                                else:
452
                                    indextype='H'
453
                                output.write(zlib.compress(marshal.dumps(towrite.keys(),2)))
454
                                output.write(zlib.compress(array(indextype,towrite.values()).tostring()))
455
                            #listofarrays.append(array(indextype,[coldict[val] for subl in [x[i] for x in listofnditers] for val in subl]))
456
                            output.write(zlib.compress(array(indextype,[coldict[val] for val in listofvals[i]] ).tostring()))
457

    
458
#                for i,l in enumerate(listofarrays):
459
#                    if i in paxcols:
460
#                        output.write(zlib.compress(marshal.dumps(sorted(l),2)))
461
#                    else:
462
#                        output.write(zlib.compress(l.tostring()))
463

    
464

    
465

    
466
                headindex[i+1] = output.tell()+ prev
467
                headindex[i+2] = count
468

    
469
                count=0
470
                fileIter.write(struct.pack('L'*len(headindex), *headindex))
471
                fileIter.write(output.getvalue())
472
                listoflens = [0 for _ in xrange(colnum)]
473
                for s in setcol:
474
                    s.clear()
475
                #listofarrays = []
476
                listofnditers = []
477
                listofvals=[]
478
                gc.collect()
479
                step = dictsize
480
                blocknum+=1
481

    
482
            if ret:
483
                break
484
            
485
    if mode == 'sortspac':
486
        colnum = len(schema)
487
        marshal.dump(schema,fileIter,2)
488
        setcol = [set([]) for _ in xrange(colnum)]
489
#        compressorlistcols = [zlib.BZ2Compressor(9) for _ in xrange(colnum)]
490
#        compressorlistdicts = [zlib.BZ2Compressor(9) for _ in xrange(colnum)]
491
#        compressorlistvals = [zlib.BZ2Compressor(9) for _ in xrange(colnum)]
492
        listoflens = [0 for _ in xrange(colnum)]
493
        start = 0
494
        dictsize = 65536
495
        #listofarrays = []
496
        paxcols = []
497
        indextype = 'H'
498
        step = dictsize
499
        stopstep = int(dictsize/20)
500
        bsize = 0
501
        listofnditers = []
502
        numrows=0
503
        dictlimit = step - colnum
504
        coldicts = [{} for _ in xrange(colnum)]
505
        prevsets =  [set([]) for _ in xrange(colnum)]
506
        count = 0
507
        blocknum = 0
508
        bsize = 0
509
        while True:
510
            maxlen = 0
511
            ret = False
512
            z = zip(*itertools.islice(diter, 0, step))
513

    
514
            if z==[]:
515
                ret = True
516
            else:
517
                for c in z:
518
                    bsize+=sys.getsizeof(c)
519
                listofnditers.append(z)
520

    
521
                if len(listofnditers[-1])!=0:
522
                    count += len(listofnditers[-1][0])
523
                else:
524
                    ret = 1
525

    
526
                for i,col in enumerate(listofnditers[-1]):
527
                    if i not in paxcols:
528
                        setcol[i].update(col)
529
                        l = len(setcol[i])
530
                        if l > maxlen:
531
                            maxlen = l
532

    
533
            step = dictsize - maxlen
534
           
535

    
536

    
537

    
538
            if step < stopstep or ret or bsize>300000000:
539
                bsize=0
540
                prev = fileIter.tell() + 4*(colnum+2)
541
                output = cStringIO.StringIO()
542
                headindex = [0 for _ in xrange(colnum+2)]
543

    
544
                listofvals = []
545
                for j in xrange(colnum):
546
                    listofvals.append([val for subl in [x[j] for x in listofnditers] for val in subl])
547
                listofvals1 = zip(*listofvals)
548
                listofvals1.sort(key=lambda listofvals1: listofvals1[5])
549
                listofvals = zip(*listofvals1)
550
                for s in setcol:
551
                    s.clear()
552
                for l in xrange(colnum):
553
                    setcol[l] = set(listofvals[l])
554

    
555

    
556
                if blocknum == 0:
557

    
558
                    for i in xrange(colnum):
559
                        headindex[i] = output.tell() + prev
560
#                        s =  [val for subl in [x[i] for x in listofnditers] for val in subl]
561
#                        if (dictsize*2+len(marshal.dumps(setcol[i],2))>len(marshal.dumps(s,2))):
562
#                            paxcols.append(i)
563
#                            output.write(compressorlistdicts[i].compress(marshal.dumps(s,2)))
564
                        if (len(setcol[i])>55000):
565
                            paxcols.append(i)
566
                            print i
567
                            output.write(zlib.compress(marshal.dumps(listofvals[i],2)))
568
                            #listofarrays.append(s)
569
                        else:
570
                            prevsets[i] = set(setcol[i]).copy()
571
                            s = sorted(setcol[i])
572
                            coldicts[i] = dict(((x,y) for y,x in enumerate(s)))
573
                            coldict = coldicts[i]
574
                            if len(s)<256:
575
                                indextype='B'
576
                            else:
577
                                indextype='H'
578
                            #listofarrays.append(array(indextype,[coldict[val] for subl in [x[i] for x in listofnditers] for val in subl]))
579
                            output.write(zlib.compress(marshal.dumps(s,2)))
580
                            output.write(zlib.compress(array(indextype,[coldict[val] for val in listofvals[i]] ).tostring()))
581
                            #l = [coldict[val] for subl in [x[i] for x in listofnditers] for val in subl]
582
                            #z = [coldict[val] for val in listofvals[i]]
583
                            #print l
584
                            #print i
585
                            #print len(z)
586

    
587
                else:
588
#                    listofnditers1 = zip(*listofnditers)
589
#                    listofnditers1.sort(key=lambda listofnditers1: listofnditers1[paxcols[0]])
590
#                    listofnditers = zip(*listofnditers1)
591
                    for i in xrange(colnum):
592
                        headindex[i] = output.tell() + prev
593
                        if i in paxcols:
594
                            s =  [val for subl in [x[i] for x in listofnditers] for val in subl]
595
                            output.write(zlib.compress(marshal.dumps(listofvals[i],2)))
596
                            #listofarrays.append(s)
597
                        else:
598
                            difnew = list(setcol[i]-prevsets[i])
599
                            difold = list(prevsets[i]-setcol[i])
600
                            prevsets[i].intersection_update(setcol[i])
601
                            prevsets[i].update(difnew)
602
                            prevsets[i].update(difold[:(dictsize-len(prevsets[i]))])
603
                            towrite = {}
604
                            le = len(difold)
605
                            si = len(coldicts[i])
606
                            for l,j in enumerate(difnew):
607
                                if l<le:
608
                                    towrite[j] = coldicts[i][difold[l]]
609
                                else:
610
                                    towrite[j] = si
611
                                    si+=1
612
                            coldicts[i] = dict(((x,y) for y,x in enumerate(prevsets[i])))
613
                            coldict = coldicts[i]
614
                            if len(prevsets[i]) != 0 :
615
                                if len(prevsets[i])<256:
616
                                    indextype='B'
617
                                else:
618
                                    indextype='H'
619
                                output.write(zlib.compress(marshal.dumps(towrite.keys(),2)))
620
                                output.write(zlib.compress(array(indextype,towrite.values()).tostring()))
621
                            #listofarrays.append(array(indextype,[coldict[val] for subl in [x[i] for x in listofnditers] for val in subl]))
622
                            output.write(zlib.compress(array(indextype,[coldict[val] for val in listofvals[i]] ).tostring()))
623

    
624
#                for i,l in enumerate(listofarrays):
625
#                    if i in paxcols:
626
#                        output.write(zlib.compress(marshal.dumps(sorted(l),2)))
627
#                    else:
628
#                        output.write(zlib.compress(l.tostring()))
629

    
630

    
631

    
632
                headindex[i+1] = output.tell()+ prev
633
                headindex[i+2] = count
634

    
635
                count=0
636
                fileIter.write(struct.pack('L'*len(headindex), *headindex))
637
                fileIter.write(output.getvalue())
638
                listoflens = [0 for _ in xrange(colnum)]
639
                for s in setcol:
640
                    s.clear()
641
                #listofarrays = []
642
                listofnditers = []
643
                gc.collect()
644
                step = dictsize
645
                blocknum+=1
646

    
647
            if ret:
648
                break
649

    
650

    
651

    
652
    if mode == 'corelspac':
653
        colnum = len(schema)
654
        marshal.dump(schema,fileIter,2)
655
        setcol = [set([]) for _ in xrange(colnum)]
656
#        compressorlistcols = [zlib.BZ2Compressor(9) for _ in xrange(colnum)]
657
#        compressorlistdicts = [zlib.BZ2Compressor(9) for _ in xrange(colnum)]
658
#        compressorlistvals = [zlib.BZ2Compressor(9) for _ in xrange(colnum)]
659
        listoflens = [0 for _ in xrange(colnum)]
660
        start = 0
661
        dictsize = 65536
662
        #listofarrays = []
663
        paxcols = []
664
        indextype = 'H'
665
        step = dictsize
666
        stopstep = int(dictsize/20)
667
        bsize = 0
668
        listofnditers = []
669
        numrows=0
670
        dictlimit = step - colnum
671
        coldicts = [{} for _ in xrange(colnum)]
672
        prevsets =  [set([]) for _ in xrange(colnum)]
673
        corelset = {}
674
        correlatedcols = [{} for _ in xrange(colnum**colnum)]
675
        correlated = set()
676
        firstgroup = set()
677
        count = 0
678
        blocknum = 0
679
        bsize = 0
680

    
681
        def allgroup(expansions, n=0, groups = []):
682
            expgroup = [expansions[n]]
683
            if n == len(expansions) - 1:
684
                yield groups + [expgroup]
685
                for i in xrange(len(groups)):
686
                    tmp = groups[i]
687
                    groups[i] = tmp + expgroup
688
                    yield groups
689
                    groups[i] = tmp
690
            else:
691
                for g in allgroup(expansions, n+1, groups + [expgroup]):
692
                    yield g
693
                for i in xrange(len(groups)):
694
                    tmp = groups[i]
695
                    groups[i] = tmp + expgroup
696
                    for g in allgroup(expansions, n + 1, groups):
697
                        yield g
698
                    groups[i] = tmp
699

    
700
        for i in allgroup(range(colnum)):
701
            print i
702

    
703

    
704
        while True:
705
            maxlen = 0
706
            ret = False
707
            z = zip(*itertools.islice(diter, 0, step))
708

    
709
            if z==[]:
710
                ret = True
711
            else:
712
                for c in z:
713
                    bsize+=sys.getsizeof(c)
714
                listofnditers.append(z)
715

    
716
                if len(listofnditers[-1])!=0:
717
                    count += len(listofnditers[-1][0])
718
                else:
719
                    ret = 1
720

    
721
                for i,col in enumerate(listofnditers[-1]):
722
                    if i not in paxcols:
723
                        setcol[i].update(col)
724
                        l = len(setcol[i])
725
                        if l > maxlen:
726
                            maxlen = l
727

    
728
            step = dictsize - maxlen
729
            if step < stopstep or ret or bsize>300000000:
730
                bsize=0
731
                prev = fileIter.tell() + 4*(colnum+2)
732
                output = cStringIO.StringIO()
733
                headindex = [0 for _ in xrange(colnum+2)]
734
                listofvals = []
735
                for j in xrange(colnum):
736
                    listofvals.append([val for subl in [x[j] for x in listofnditers] for val in subl])
737

    
738
                if blocknum == 0:
739
                    sortdict = {}
740
                    for i in xrange(colnum):
741
                        if (len(setcol[i])>55000):
742
                            print i
743
                            paxcols.append(i)
744
                        sortdict[i] = len(setcol[i])
745
#                    le = sorted(sortdict, key=sortdict.get)
746
#                    ginomeno = 1
747
#
748
#                    for i in le:
749
#                        ginomeno = ginomeno * sortdict[i]
750
#                        if ginomeno > 256:
751
#                            break
752
#                        else:
753
#                            pass #firstgroup.add(i)
754
#                    num = colnum-len(paxcols)-len(firstgroup)
755
#                    if num%2 != 0 and len(firstgroup)!=0:
756
#                        firstgroup.remove(le[0])
757
#
758
#                    for i in xrange(colnum-1):
759
#                        for j in xrange(i+1,colnum):
760
#                            if  i not in correlated and j not in correlated and i not in firstgroup and j not in firstgroup:
761
#                                l = zip(listofvals[i],listofvals[j])
762
#                                #print (float(len(set(l)))/len(set(listofvals[i]))+len(set(listofvals[j]))) , i , j
763
#                                if len(set(l)) < 10000:
764
#                                    correlatedcols[i] = j
765
#                                    correlated.add(i)
766
#                                    correlated.add(j)
767
#                                    i+=1
768
#                    print correlatedcols
769
                    check = 0
770
                    for i in xrange(colnum):
771
                        headindex[i] = output.tell() + prev
772
#                        s =  [val for subl in [x[i] for x in listofnditers] for val in subl]
773
#                        if (dictsize*2+len(marshal.dumps(setcol[i],2))>len(marshal.dumps(s,2))):
774
#                            paxcols.append(i)
775
#                            output.write(compressorlistdicts[i].compress(marshal.dumps(s,2)))
776
                        if i in paxcols:
777
                            output.write(zlib.compress(marshal.dumps(listofvals[i],2)))
778
                            #listofarrays.append(s)
779
                        elif i in firstgroup and check == 0:
780

    
781
                            check=1
782
                        elif i in correlatedcols:
783
                                corellated = zip(listofvals[i],*listofvals[correlatedcols[i]])
784
                                corelset[i] = set(corellated)
785
                                coldicts[i] = dict(((x,y) for y,x in enumerate(corelset[i])))
786
                                coldict = coldicts[i]
787
                                s = zip(*corelset[i])
788
                                output.write(zlib.compress(marshal.dumps(s[0],2)))
789
                                output.write(zlib.compress(marshal.dumps(s[1],2)))
790
                                if len(corelset[i])<256:
791
                                    indextype='B'
792
                                else:
793
                                    indextype='H'
794
                                output.write(zlib.compress(array(indextype,[coldict[val] for val in corelset[i]] ).tostring()))
795

    
796
                        elif i in correlated or (i in firstgroup and check==1):
797
                                    pass
798
                        else:
799
                            prevsets[i] = set(setcol[i]).copy()
800
                            s = sorted(setcol[i])
801
                            coldicts[i] = dict(((x,y) for y,x in enumerate(s)))
802
                            coldict = coldicts[i]
803
                            if len(s)<256:
804
                                indextype='B'
805
                            else:
806
                                indextype='H'
807
                            #listofarrays.append(array(indextype,[coldict[val] for subl in [x[i] for x in listofnditers] for val in subl]))
808
                            output.write(zlib.compress(marshal.dumps(s,2)))
809
                            output.write(zlib.compress(array(indextype,[coldict[val] for val in listofvals[i]] ).tostring()))
810
                else:
811
#                    listofnditers1 = zip(*listofnditers)
812
#                    listofnditers1.sort(key=lambda listofnditers1: listofnditers1[paxcols[0]])
813
#                    listofnditers = zip(*listofnditers1)
814
                    check=0
815
                    for i in xrange(colnum):
816
                        headindex[i] = output.tell() + prev
817
                        if i in paxcols:
818
                            output.write(zlib.compress(marshal.dumps(listofvals[i],2)))
819
                            #listofarrays.append(s)
820
                        elif i in firstgroup and check == 0:
821
                            check=1
822
                        elif i in correlatedcols:
823
                                corellated = zip(listofvals[i],listofvals[correlatedcols[i]])
824
                                corelset1 = set(corellated)
825
                                difnew = list(corelset1-corelset[i])
826
                                difold = list(corelset[i]-corelset1)
827
                                corelset[i].intersection_update(corelset1)
828
                                corelset[i].update(difnew)
829
                                corelset[i].update(difold)
830
                                towrite = {}
831
                                le = len(difold)
832
                                si = len(coldicts[i])
833
                                for l,j in enumerate(difnew):
834
                                    if l<le:
835
                                        towrite[j] = coldicts[i][difold[l]]
836
                                    else:
837
                                        towrite[j] = si
838
                                        si+=1
839

    
840
                                coldicts[i] = dict(((x,y) for y,x in enumerate(corelset[i])))
841
                                coldict = coldicts[i]
842
                                s = zip(*towrite.keys())
843
                                if len(s)>0:
844
                                    output.write(zlib.compress(marshal.dumps(s[0],2)))
845
                                    output.write(zlib.compress(marshal.dumps(s[1],2)))
846
                                if len(corelset[i])<256:
847
                                    indextype='B'
848
                                else:
849
                                    indextype='H'
850
                                output.write(zlib.compress(array(indextype,[coldict[val] for val in corelset[i]] ).tostring()))
851
                        elif i in correlatedcols.values() or (i in firstgroup and check==1):
852
                                    pass
853
                        else:
854
                            difnew = list(setcol[i]-prevsets[i])
855
                            difold = list(prevsets[i]-setcol[i])
856
                            prevsets[i].intersection_update(setcol[i])
857
                            prevsets[i].update(difnew)
858
                            prevsets[i].update(difold[:(dictsize-len(prevsets[i]))])
859
                            towrite = {}
860
                            le = len(difold)
861
                            si = len(coldicts[i])
862
                            for l,j in enumerate(difnew):
863
                                if l<le:
864
                                    towrite[j] = coldicts[i][difold[l]]
865
                                else:
866
                                    towrite[j] = si
867
                                    si+=1
868
                            coldicts[i] = dict(((x,y) for y,x in enumerate(prevsets[i])))
869
                            coldict = coldicts[i]
870
                            if len(prevsets[i]) != 0 :
871
                                if len(prevsets[i])<256:
872
                                    indextype='B'
873
                                else:
874
                                    indextype='H'
875
                                output.write(zlib.compress(marshal.dumps(towrite.keys(),2)))
876
                                output.write(zlib.compress(array(indextype,towrite.values()).tostring()))
877
                            #listofarrays.append(array(indextype,[coldict[val] for subl in [x[i] for x in listofnditers] for val in subl]))
878
                            output.write(zlib.compress(array(indextype,[coldict[val] for val in listofvals[i]] ).tostring()))
879

    
880
#                for i,l in enumerate(listofarrays):
881
#                    if i in paxcols:
882
#                        output.write(zlib.compress(marshal.dumps(sorted(l),2)))
883
#                    else:
884
#                        output.write(zlib.compress(l.tostring()))
885

    
886

    
887

    
888
                headindex[i+1] = output.tell()+ prev
889
                headindex[i+2] = count
890

    
891
                count=0
892
                fileIter.write(struct.pack('L'*len(headindex), *headindex))
893
                fileIter.write(output.getvalue())
894
                listoflens = [0 for _ in xrange(colnum)]
895
                for s in setcol:
896
                    s.clear()
897
                #listofarrays = []
898
                listofnditers = []
899
                gc.collect()
900
                step = dictsize
901
                blocknum+=1
902

    
903
            if ret:
904
                break
905

    
906

    
907

    
908

    
909
   
910
    if mode == 'corelspacfinal':
911
        colnum = len(schema)
912
        marshal.dump(schema,fileIter,2)
913
        setcol = [set([]) for _ in xrange(colnum)]
914
#        compressorlistcols = [zlib.BZ2Compressor(9) for _ in xrange(colnum)]
915
#        compressorlistdicts = [zlib.BZ2Compressor(9) for _ in xrange(colnum)]
916
#        compressorlistvals = [zlib.BZ2Compressor(9) for _ in xrange(colnum)]
917
        firstgroup = set()
918
        correlatedcols = {}
919
        correlated = set()
920
        listoflens = [0 for _ in xrange(colnum)]
921
        start = 0
922
        dictsize = 65536
923
        paxcols = []
924
        indextype = 'H'
925
        step = dictsize
926
        stopstep = int(dictsize/20)
927
        bsize = 0
928
        listofnditers = []
929
        numrows=0
930
        dictlimit = step - colnum
931
        coldicts = [{} for _ in xrange(colnum)]
932
        prevsets =  [[] for _ in xrange(colnum)]
933
        corelset = {}
934
        count = 0
935
        blocknum = 0
936
        bsize = 0
937
        while True:
938
            maxlen = 0
939
            ret = False
940
            z = zip(*itertools.islice(diter, 0, step))
941

    
942
            if z==[]:
943
                ret = True
944
            else:
945
                for c in z:
946
                    bsize+=sys.getsizeof(c)
947
                listofnditers.append(z)
948

    
949
                if len(listofnditers[-1])!=0:
950
                    count += len(listofnditers[-1][0])
951
                else:
952
                    ret = 1
953

    
954
                for i,col in enumerate(listofnditers[-1]):
955
                    if i not in paxcols:
956
                        setcol[i].update(col)
957
                        l = len(setcol[i])
958
                        if l > maxlen:
959
                            maxlen = l
960

    
961
            step = dictsize - maxlen
962
            if step < stopstep or ret or bsize>300000000:
963
                bsize=0
964
                prev = fileIter.tell() + 8*(colnum+2)
965
                output = cStringIO.StringIO()
966
                headindex = [0 for _ in xrange(colnum+2)]
967
                listofvals = []
968
                for j in xrange(colnum):
969
                    listofvals.append([val for subl in [x[j] for x in listofnditers] for val in subl])
970

    
971
                if blocknum == 0:
972
                    sortdict = {}
973
                    for i in xrange(colnum):
974
                        if (len(setcol[i])>55000):
975
                            paxcols.append(i)
976
                        sortdict[i] = len(setcol[i])
977
                    le = sorted(sortdict, key=sortdict.get)
978
                    ginomeno = 1
979

    
980
                    print paxcols
981
                    for i in xrange(colnum-1):
982
                        for j in xrange(i+1,colnum):
983
                            if  i not in correlated and j not in correlated :
984
                                l = zip(listofvals[i],listofvals[j])
985
                                #print (float(len(set(l)))/len(set(listofvals[i]))+len(set(listofvals[j]))) , i , j
986
                                if len(set(l)) < 10000:
987
                                    correlatedcols[i] = j
988
                                    correlated.add(i)
989
                                    correlated.add(j)
990
                                    i+=1
991
                    print correlatedcols
992

    
993
                    for i in xrange(colnum):
994
                        headindex[i] = output.tell() + prev
995

    
996
#                        s =  [val for subl in [x[i] for x in listofnditers] for val in subl]
997
#                        if (dictsize*2+len(marshal.dumps(setcol[i],2))>len(marshal.dumps(s,2))):
998
#                            paxcols.append(i)
999
#                            output.write(compressorlistdicts[i].compress(marshal.dumps(s,2)))
1000
                        if (len(setcol[i])>55000):
1001
                            paxcols.append(i)
1002
                            l = [0 for _ in xrange(3)]
1003
                            t = output.tell()
1004
                            output.write(struct.pack('L'*len(l), *l))
1005
                            sortedlist = sorted(listofvals[i])
1006
                            output.write(zlib.compress(marshal.dumps(sortedlist,2)))
1007
                            indextype = 'H'
1008
                            indexeslist = []
1009
                            for value in sortedlist:
1010
                                indexeslist.append(listofvals[i].index(value))
1011
                            output.write(zlib.compress(array(indextype,indexeslist).tostring()))
1012
                            l[0] = output.tell()
1013
                            output.seek(t)
1014
                            output.write(struct.pack('L'*len(l), *l))
1015
                            output.seek(l[0])
1016

    
1017
                        elif i in correlatedcols:
1018
                                corellated = zip(listofvals[i],listofvals[correlatedcols[i]])
1019
                                corelset[i] = set(corellated)
1020
                                coldicts[i] = dict(((x,y) for y,x in enumerate(corelset[i])))
1021
                                coldict = coldicts[i]
1022
                                s = zip(*corelset[i])
1023
                                output.write(zlib.compress(marshal.dumps(s[0],2)))
1024
                                output.write(zlib.compress(marshal.dumps(s[1],2)))
1025
                                if len(corelset[i])<256:
1026
                                    indextype='B'
1027
                                else:
1028
                                    indextype='H'
1029
                                output.write(zlib.compress(array(indextype,[coldict[val] for val in corelset[i]] ).tostring()))
1030

    
1031
                        elif i in correlated:
1032
                                    pass
1033
                            #listofarrays.append(s)
1034
#                        elif i==8 or i==9:
1035
#                            if i==8:
1036
#                                pass
1037
#                            if i==9:
1038
#                                corellated = zip(listofvals[8],listofvals[9])
1039
#                                corelset = set(corellated)
1040
#                                coldicts[i] = dict(((x,y) for y,x in enumerate(corelset)))
1041
#                                coldict = coldicts[i]
1042
#                                s = zip(*corelset)
1043
#                                output.write(zlib.compress(marshal.dumps(s[0],2)))
1044
#                                output.write(zlib.compress(marshal.dumps(s[1],2)))
1045
#                                indextype='B'
1046
#                                output.write(zlib.compress(array(indextype,[coldict[val] for val in corelset] ).tostring()))
1047
                        else:
1048
                            prevsets[i] = list(set(setcol[i]).copy())
1049
                            coldicts[i] = dict(((x,y) for y,x in enumerate(prevsets[i])))
1050
                            coldict = coldicts[i]
1051
                            if len(prevsets[i])<256:
1052
                                indextype='B'
1053
                            else:
1054
                                indextype='H'
1055
                            #listofarrays.append(array(indextype,[coldict[val] for subl in [x[i] for x in listofnditers] for val in subl]))
1056
                            l = [0 for _ in xrange(3)]
1057
                            t = output.tell()
1058
                            output.write(struct.pack('L'*len(l), *l))
1059
                            output.write(zlib.compress(marshal.dumps(prevsets[i],2)))
1060
                            l[0] = output.tell()
1061
                            output.write(zlib.compress(array(indextype,[coldict[val] for val in listofvals[i]] ).tostring()))
1062
                            l[1] = output.tell()
1063
                            output.seek(t)
1064
                            output.write(struct.pack('L'*len(l), *l))
1065
                            output.seek(l[1])
1066
                            print l
1067
                else:
1068
#                    listofnditers1 = zip(*listofnditers)
1069
#                    listofnditers1.sort(key=lambda listofnditers1: listofnditers1[paxcols[0]])
1070
#                    listofnditers = zip(*listofnditers1)
1071
                    check = 0
1072
                    for i in xrange(colnum):
1073
                        headindex[i] = output.tell() + prev
1074
                        if i in paxcols:
1075
                            l = [0 for _ in xrange(3)]
1076
                            t = output.tell()
1077
                            output.write(struct.pack('L'*len(l), *l))
1078
                            sortedlist = sorted(listofvals[i])
1079
                            output.write(zlib.compress(marshal.dumps(sortedlist,2)))
1080
                            indextype = 'H'
1081
                            indexeslist = []
1082
                            for value in sortedlist:
1083
                                indexeslist.append(listofvals[i].index(value))
1084
                            output.write(zlib.compress(array(indextype,indexeslist).tostring()))
1085
                            l[0] = output.tell()
1086
                            output.seek(t)
1087
                            output.write(struct.pack('L'*len(l), *l))
1088
                            output.seek(l[0])
1089
                            #listofarrays.append(s)
1090
#                        elif i==8 or i==9:
1091
#                            if i==8:
1092
#                                pass
1093
#                            if i==9:
1094
#                                corellated = zip(listofvals[8],listofvals[9])
1095
#                                corelset1 = set(corellated)
1096
#                                difnew = list(corelset1-corelset)
1097
#                                difold = list(corelset-corelset1)
1098
#                                corelset.intersection_update(corelset1)
1099
#                                corelset.update(difnew)
1100
#                                corelset.update(difold)
1101
#                                towrite = {}
1102
#                                le = len(difold)
1103
#                                si = len(coldicts[i])
1104
#                                for l,j in enumerate(difnew):
1105
#                                    if l<le:
1106
#                                        towrite[j] = coldicts[i][difold[l]]
1107
#                                    else:
1108
#                                        towrite[j] = si
1109
#                                        si+=1
1110
#
1111
#                                coldicts[i] = dict(((x,y) for y,x in enumerate(corelset)))
1112
#                                coldict = coldicts[i]
1113
#                                s = zip(*towrite.keys())
1114
#                                if len(s)>0:
1115
#                                    output.write(zlib.compress(marshal.dumps(s[0],2)))
1116
#                                    output.write(zlib.compress(marshal.dumps(s[1],2)))
1117
#                                indextype='B'
1118
#                                output.write(zlib.compress(array(indextype,[coldict[val] for val in corelset] ).tostring()))
1119
                        elif i in correlatedcols:
1120
                                corellated = zip(listofvals[i],listofvals[correlatedcols[i]])
1121
                                corelset1 = set(corellated)
1122
                                difnew = list(corelset1-corelset[i])
1123
                                difold = list(corelset[i]-corelset1)
1124
                                corelset[i].intersection_update(corelset1)
1125
                                corelset[i].update(difnew)
1126
                                corelset[i].update(difold)
1127
                                towrite = {}
1128
                                le = len(difold)
1129
                                si = len(coldicts[i])
1130
                                for l,j in enumerate(difnew):
1131
                                    if l<le:
1132
                                        towrite[j] = coldicts[i][difold[l]]
1133
                                    else:
1134
                                        towrite[j] = si
1135
                                        si+=1
1136

    
1137
                                coldicts[i] = dict(((x,y) for y,x in enumerate(corelset[i])))
1138
                                coldict = coldicts[i]
1139
                                s = zip(*towrite.keys())
1140
                                if len(s)>0:
1141
                                    output.write(zlib.compress(marshal.dumps(s[0],2)))
1142
                                    output.write(zlib.compress(marshal.dumps(s[1],2)))
1143
                                if len(corelset[i])<256:
1144
                                    indextype='B'
1145
                                else:
1146
                                    indextype='H'
1147
                                output.write(zlib.compress(array(indextype,[coldict[val] for val in corelset[i]] ).tostring()))
1148
                        elif i in correlated :
1149
                                    pass
1150
                        else:
1151
                            difnew = list(setcol[i]-set(prevsets[i]))
1152
                            difold = list(set(prevsets[i])-setcol[i])
1153
                            s = []
1154

    
1155
                            s = list(prevsets[i]) + difnew
1156
                            d = 0
1157
                            while len(s)>dictsize:
1158
                                s.remove[difold[d]]
1159
                                d+=1
1160
                            prevsets[i] = s
1161

    
1162

    
1163

    
1164

    
1165
                            towrite = {}
1166
                            #le = len(difold)
1167
                            si = len(coldicts[i])
1168
                            for ki,j in enumerate(difnew):
1169
                                towrite[j] = si
1170
                                si+=1
1171

    
1172
                            coldicts[i] = dict(((x,y) for y,x in enumerate(s)))
1173
                            coldict = coldicts[i]
1174
                            l = [0 for _ in xrange(3)]
1175
                            t = output.tell()
1176
                            output.write(struct.pack('L'*len(l), *l))
1177
                            if len(prevsets[i]) != 0 :
1178
                                if len(prevsets[i])<256:
1179
                                    indextype='B'
1180
                                else:
1181
                                    indextype='H'
1182

    
1183
                                output.write(zlib.compress(marshal.dumps(towrite.keys(),2)))
1184
                                l[0] = output.tell()
1185
                                output.write(zlib.compress(array(indextype,towrite.values()).tostring()))
1186
                                l[1] = output.tell()
1187
                            #listofarrays.append(array(indextype,[coldict[val] for subl in [x[i] for x in listofnditers] for val in subl]))
1188
                            output.write(zlib.compress(array(indextype,[coldict[val] for val in listofvals[i]] ).tostring()))
1189
                            l[2] = output.tell()
1190
                            output.seek(t)
1191
                            output.write(struct.pack('L'*len(l), *l))
1192
                            output.seek(l[2])
1193
#                for i,l in enumerate(listofarrays):
1194
#                    if i in paxcols:
1195
#                        output.write(zlib.compress(marshal.dumps(sorted(l),2)))
1196
#                    else:
1197
#                        output.write(zlib.compress(l.tostring()))
1198

    
1199

    
1200

    
1201
                headindex[i+1] = output.tell()+ prev
1202
                headindex[i+2] = count
1203

    
1204
                count=0
1205
                output.flush()
1206
                fileIter.write(struct.pack('L'*len(headindex), *headindex))
1207
                print 'lala' + str(fileIter.tell())
1208
                fileIter.write(output.getvalue())
1209
                listoflens = [0 for _ in xrange(colnum)]
1210
                for s in setcol:
1211
                    s.clear()
1212
                #listofarrays = []
1213
                listofnditers = []
1214
                gc.collect()
1215
                step = dictsize
1216
                blocknum+=1
1217

    
1218
            if ret:
1219
                fileIter.close()
1220
                break
1221

    
1222
    if mode == 'spactest':
1223
        colnum = len(schema)
1224
        marshal.dump(schema,fileIter,2)
1225
        setcol = [set([]) for _ in xrange(colnum)]
1226
#        compressorlistcols = [zlib.BZ2Compressor(9) for _ in xrange(colnum)]
1227
#        compressorlistdicts = [zlib.BZ2Compressor(9) for _ in xrange(colnum)]
1228
#        compressorlistvals = [zlib.BZ2Compressor(9) for _ in xrange(colnum)]
1229
        listoflens = [0 for _ in xrange(colnum)]
1230
        start = 0
1231
        dictsize = 65536
1232
        #listofarrays = []
1233
        paxcols = []
1234
        indextype = 'H'
1235
        step = dictsize
1236
        stopstep = int(dictsize/20)
1237
        bsize = 0
1238
        listofnditers = []
1239
        numrows=0
1240
        dictlimit = step - colnum
1241
        coldicts = [{} for _ in xrange(colnum)]
1242
        prevsets =  [set([]) for _ in xrange(colnum)]
1243
        corelset = {}
1244
        correlatedcols = {}
1245
        correlated = set()
1246
        firstgroup = set()
1247
        count = 0
1248
        blocknum = 0
1249
        bsize = 0
1250
        while True:
1251
            maxlen = 0
1252
            ret = False
1253
            z = zip(*itertools.islice(diter, 0, step))
1254

    
1255
            if z==[]:
1256
                ret = True
1257
            else:
1258
                for c in z:
1259
                    bsize+=sys.getsizeof(c)
1260
                listofnditers.append(z)
1261

    
1262
                if len(listofnditers[-1])!=0:
1263
                    count += len(listofnditers[-1][0])
1264
                else:
1265
                    ret = 1
1266

    
1267
                for i,col in enumerate(listofnditers[-1]):
1268
                    if i not in paxcols:
1269
                        setcol[i].update(col)
1270
                        l = len(setcol[i])
1271
                        if l > maxlen:
1272
                            maxlen = l
1273

    
1274
            step = dictsize - maxlen
1275
            if step < stopstep or ret or bsize>300000000:
1276
                bsize=0
1277
                prev = fileIter.tell() + 4*(colnum+2)
1278
                output = cStringIO.StringIO()
1279
                headindex = [0 for _ in xrange(colnum+2)]
1280
                listofvals = []
1281
                for j in xrange(colnum):
1282
                    listofvals.append([val for subl in [x[j] for x in listofnditers] for val in subl])
1283

    
1284
                if blocknum == 0:
1285

    
1286

    
1287
                    sortdict = {}
1288
                    for i in xrange(colnum):
1289
                        if (len(setcol[i])>55000):
1290
                            paxcols.append(i)
1291
                        sortdict[i] = len(setcol[i])
1292

    
1293
                    check = 0
1294
                    for i in xrange(colnum):
1295
                        headindex[i] = output.tell() + prev
1296
#                        s =  [val for subl in [x[i] for x in listofnditers] for val in subl]
1297
#                        if (dictsize*2+len(marshal.dumps(setcol[i],2))>len(marshal.dumps(s,2))):
1298
#                            paxcols.append(i)
1299
#                            output.write(compressorlistdicts[i].compress(marshal.dumps(s,2)))
1300
                        if i in paxcols:
1301
                            output.write(zlib.compress(marshal.dumps(listofvals[i],2)))
1302
                            #listofarrays.append(s)
1303
                        else:
1304
                            prevsets[i] = set(setcol[i]).copy()
1305
                            s = sorted(setcol[i])
1306
                            coldicts[i] = dict(((x,y) for y,x in enumerate(s)))
1307
                            coldict = coldicts[i]
1308
                            if len(s)<256:
1309
                                indextype='B'
1310
                            else:
1311
                                indextype='H'
1312
                            #listofarrays.append(array(indextype,[coldict[val] for subl in [x[i] for x in listofnditers] for val in subl]))
1313
                            output.write(zlib.compress(marshal.dumps(s,2)))
1314
                            output.write(zlib.compress(array(indextype,[coldict[val] for val in listofvals[i]] ).tostring()))
1315
                else:
1316
#                    listofnditers1 = zip(*listofnditers)
1317
#                    listofnditers1.sort(key=lambda listofnditers1: listofnditers1[paxcols[0]])
1318
#                    listofnditers = zip(*listofnditers1)
1319
                    check=0
1320
                    for i in xrange(colnum):
1321
                        headindex[i] = output.tell() + prev
1322
                        if i in paxcols:
1323
                            output.write(zlib.compress(marshal.dumps(listofvals[i],2)))
1324
                            #listofarrays.append(s)
1325
                        else:
1326
                            difnew = list(setcol[i]-prevsets[i])
1327
                            difold = list(prevsets[i]-setcol[i])
1328
                            prevsets[i].intersection_update(setcol[i])
1329
                            prevsets[i].update(difnew)
1330
                            prevsets[i].update(difold[:(dictsize-len(prevsets[i]))])
1331
                            towrite = {}
1332
                            le = len(difold)
1333
                            si = len(coldicts[i])
1334
                            for l,j in enumerate(difnew):
1335
                                if l<le:
1336
                                    towrite[j] = coldicts[i][difold[l]]
1337
                                else:
1338
                                    towrite[j] = si
1339
                                    si+=1
1340
                            coldicts[i] = dict(((x,y) for y,x in enumerate(prevsets[i])))
1341
                            coldict = coldicts[i]
1342
                            if len(prevsets[i]) != 0 :
1343
                                if len(prevsets[i])<256:
1344
                                    indextype='B'
1345
                                else:
1346
                                    indextype='H'
1347
                                output.write(zlib.compress(marshal.dumps(towrite.keys(),2)))
1348
                                output.write(zlib.compress(array(indextype,towrite.values()).tostring()))
1349
                            #listofarrays.append(array(indextype,[coldict[val] for subl in [x[i] for x in listofnditers] for val in subl]))
1350
                            output.write(zlib.compress(array(indextype,[coldict[val] for val in listofvals[i]] ).tostring()))
1351

    
1352
#                for i,l in enumerate(listofarrays):
1353
#                    if i in paxcols:
1354
#                        output.write(zlib.compress(marshal.dumps(sorted(l),2)))
1355
#                    else:
1356
#                        output.write(zlib.compress(l.tostring()))
1357

    
1358

    
1359

    
1360
                headindex[i+1] = output.tell()+ prev
1361
                headindex[i+2] = count
1362

    
1363
                count=0
1364
                fileIter.write(struct.pack('L'*len(headindex), *headindex))
1365
                fileIter.write(output.getvalue())
1366
                listoflens = [0 for _ in xrange(colnum)]
1367
                for s in setcol:
1368
                    s.clear()
1369
                #listofarrays = []
1370
                listofnditers = []
1371
                gc.collect()
1372
                step = dictsize
1373
                blocknum+=1
1374

    
1375
            if ret:
1376
                break
1377

    
1378

    
1379
    if mode == 'spac':
1380
        colnum = len(schema)
1381
        marshal.dump(schema,fileIter,2)
1382
        setcol = [set([]) for _ in xrange(colnum)]
1383
#        compressorlistcols = [zlib.BZ2Compressor(9) for _ in xrange(colnum)]
1384
#        compressorlistdicts = [zlib.BZ2Compressor(9) for _ in xrange(colnum)]
1385
#        compressorlistvals = [zlib.BZ2Compressor(9) for _ in xrange(colnum)]
1386
        listoflens = [0 for _ in xrange(colnum)]
1387
        start = 0
1388
        dictsize = 65536
1389
        paxcols = []
1390
        indextype = 'H'
1391
        step = dictsize
1392
        stopstep = int(dictsize/20)
1393
        bsize = 0
1394
        listofnditers = []
1395
        numrows=0
1396
        dictlimit = step - colnum
1397
        coldicts = [{} for _ in xrange(colnum)]
1398
        prevsets =  [[] for _ in xrange(colnum)]
1399
        corelset = []
1400
        count = 0
1401
        blocknum = 0
1402
        bsize = 0
1403
        rowlimit = 0
1404
        compress = bz2.compress
1405

    
1406
        while True:
1407
            maxlen = 0.5
1408
            ret = False
1409
            z = zip(*itertools.islice(diter, 0, step))
1410

    
1411
            if z==[]:
1412
                ret = True
1413
            else:
1414
                if blocknum==0:
1415
                    for c in z:
1416
                        bsize+=sys.getsizeof(c)
1417

    
1418
                listofnditers.append(z)
1419

    
1420

    
1421
                if len(listofnditers[-1])!=0:
1422
                    count += len(listofnditers[-1][0])
1423
                else:
1424
                    ret = 1
1425

    
1426
                for i,col in enumerate(listofnditers[-1]):
1427
                    if i not in paxcols:
1428
                        setcol[i].update(col)
1429
                        l = len(setcol[i])
1430
                        if l > maxlen:
1431
                            maxlen = l
1432

    
1433

    
1434
                if blocknum>1:
1435
                    listofvals = []
1436
                    listofvals.append([val for subl in [x[paxcols[0]] for x in listofnditers] for val in subl])
1437
                    if len(listofvals[0]) > rowlimit:
1438
                        bsize = 10000001
1439

    
1440

    
1441
            step = dictsize - maxlen
1442
            if step < stopstep or ret or bsize>1000000:
1443

    
1444
                bsize=0
1445
                prev = fileIter.tell() + 8*(colnum+2)
1446
                output = cStringIO.StringIO()
1447
                headindex = [0 for _ in xrange(colnum+2)]
1448
                listofvals = []
1449
                for j in xrange(colnum):
1450
                    listofvals.append([val for subl in [x[j] for x in listofnditers] for val in subl])
1451

    
1452
                if blocknum == 0:
1453
                    for i in xrange(colnum):
1454
                        headindex[i] = output.tell() + prev
1455

    
1456
#                        s =  [val for subl in [x[i] for x in listofnditers] for val in subl]
1457
#                        if (dictsize*2+len(marshal.dumps(setcol[i],2))>len(marshal.dumps(s,2))):
1458
#                            paxcols.append(i)
1459
#                            output.write(compressorlistdicts[i].compress(marshal.dumps(s,2)))
1460
                        if (len(setcol[i])*1.0/maxlen>0.67):
1461
                            paxcols.append(i)
1462
                            l = [0 for _ in xrange(3)]
1463
                            t = output.tell()
1464
                            output.write(struct.pack('L'*len(l), *l))
1465
                            output.write(compress(marshal.dumps(listofvals[i],2)))
1466
                            l[0] = output.tell()
1467
                            output.seek(t)
1468
                            output.write(struct.pack('L'*len(l), *l))
1469
                            output.seek(l[0])
1470

    
1471
                            #listofarrays.append(s)
1472
#                        elif i==8 or i==9:
1473
#                            if i==8:
1474
#                                pass
1475
#                            if i==9:
1476
#                                corellated = zip(listofvals[8],listofvals[9])
1477
#                                corelset = set(corellated)
1478
#                                coldicts[i] = dict(((x,y) for y,x in enumerate(corelset)))
1479
#                                coldict = coldicts[i]
1480
#                                s = zip(*corelset)
1481
#                                output.write(zlib.compress(marshal.dumps(s[0],2)))
1482
#                                output.write(zlib.compress(marshal.dumps(s[1],2)))
1483
#                                indextype='B'
1484
#                                output.write(zlib.compress(array(indextype,[coldict[val] for val in corelset] ).tostring()))
1485
                        else:
1486
                            prevsets[i] = sorted(list(set(setcol[i]).copy()))
1487
                            coldicts[i] = dict(((x,y) for y,x in enumerate(prevsets[i])))
1488
                            coldict = coldicts[i]
1489
                            if len(prevsets[i])<256:
1490
                                indextype='B'
1491
                            else:
1492
                                indextype='H'
1493
                            #listofarrays.append(array(indextype,[coldict[val] for subl in [x[i] for x in listofnditers] for val in subl]))
1494
                            l = [0 for _ in xrange(3)]
1495
                            t = output.tell()
1496
                            output.write(struct.pack('L'*len(l), *l))
1497
                            output.write(compress(marshal.dumps(prevsets[i],2)))
1498
                            l[0] = output.tell()
1499
                            output.write(compress(array(indextype,[coldict[val] for val in listofvals[i]] ).tostring()))
1500
                            l[1] = output.tell()
1501
                            output.seek(t)
1502
                            output.write(struct.pack('L'*len(l), *l))
1503
                            output.seek(l[1])
1504
                else:
1505
#                    listofnditers1 = zip(*listofnditers)
1506
#                    listofnditers1.sort(key=lambda listofnditers1: listofnditers1[paxcols[0]])
1507
#                    listofnditers = zip(*listofnditers1)
1508
                    for i in xrange(colnum):
1509
                        headindex[i] = output.tell() + prev
1510
                        if i in paxcols:
1511
                            l = [0 for _ in xrange(3)]
1512
                            t = output.tell()
1513
                            output.write(struct.pack('L'*len(l), *l))
1514
                            output.write(compress(marshal.dumps(listofvals[i],2)))
1515
                            l[0] = output.tell() 
1516
                            output.seek(t)
1517
                            output.write(struct.pack('L'*len(l), *l))
1518
                            output.seek(l[0])
1519
                            if rowlimit==0:
1520
                                rowlimit = len(listofvals[i])
1521
                            
1522
                            #listofarrays.append(s)
1523
#                        elif i==8 or i==9:
1524
#                            if i==8:
1525
#                                pass
1526
#                            if i==9:
1527
#                                corellated = zip(listofvals[8],listofvals[9])
1528
#                                corelset1 = set(corellated)
1529
#                                difnew = list(corelset1-corelset)
1530
#                                difold = list(corelset-corelset1)
1531
#                                corelset.intersection_update(corelset1)
1532
#                                corelset.update(difnew)
1533
#                                corelset.update(difold)
1534
#                                towrite = {}
1535
#                                le = len(difold)
1536
#                                si = len(coldicts[i])
1537
#                                for l,j in enumerate(difnew):
1538
#                                    if l<le:
1539
#                                        towrite[j] = coldicts[i][difold[l]]
1540
#                                    else:
1541
#                                        towrite[j] = si
1542
#                                        si+=1
1543
#
1544
#                                coldicts[i] = dict(((x,y) for y,x in enumerate(corelset)))
1545
#                                coldict = coldicts[i]
1546
#                                s = zip(*towrite.keys())
1547
#                                if len(s)>0:
1548
#                                    output.write(zlib.compress(marshal.dumps(s[0],2)))
1549
#                                    output.write(zlib.compress(marshal.dumps(s[1],2)))
1550
#                                indextype='B'
1551
#                                output.write(zlib.compress(array(indextype,[coldict[val] for val in corelset] ).tostring()))
1552
                        else:
1553

    
1554
                            pset = set(prevsets[i])
1555
                            difnew = list(setcol[i] - pset)
1556

    
1557
                            s = []
1558
                            
1559
                            s = prevsets[i] + difnew
1560
                            d = 0
1561
                            if len(s) > dictsize:
1562
                                difold = list(pset - setcol[i])
1563

    
1564
                                while len(s)>dictsize:
1565
                                    s.remove(difold[d])
1566
                                    d+=1
1567
                                    
1568
                            prevsets[i] = s
1569
                            towritevalues = (x for x in xrange(len(coldicts[i]), len(coldicts[i]) + len(difnew)))
1570
                            coldicts[i] = dict(((x,y) for y,x in enumerate(s)))
1571
                            coldict = coldicts[i]
1572
                            l = [0 for _ in xrange(3)]
1573
                            t = output.tell()
1574
                            output.write(struct.pack('L'*len(l), *l))
1575
                            if len(prevsets[i]) != 0 :
1576
                                if len(prevsets[i])<256:
1577
                                    indextype='B'
1578
                                else:
1579
                                    indextype='H'
1580
                                if len(difnew)>10:
1581
                                    difnew, towritevalues = zip(*sorted(zip(difnew, towritevalues)))
1582
                                output.write(compress(marshal.dumps(difnew,2)))
1583
                                l[0] = output.tell()
1584
                                output.write(compress(array(indextype,towritevalues).tostring()))
1585
                                l[1] = output.tell()
1586
                            output.write(compress(array(indextype,[coldict[val] for val in listofvals[i]] ).tostring()))
1587
                            l[2] = output.tell()
1588
                            output.seek(t)
1589
                            output.write(struct.pack('L'*len(l), *l))
1590
                            output.seek(l[2])
1591
#                for i,l in enumerate(listofarrays):
1592
#                    if i in paxcols:
1593
#                        output.write(zlib.compress(marshal.dumps(sorted(l),2)))
1594
#                    else:
1595
#                        output.write(zlib.compress(l.tostring()))
1596

    
1597

    
1598
                        
1599
                headindex[i+1] = output.tell()+ prev
1600
                headindex[i+2] = count
1601
                
1602
                count=0
1603
                output.flush()
1604
                fileIter.write(struct.pack('L'*len(headindex), *headindex))
1605
                fileIter.write(output.getvalue())
1606
                listoflens = [0 for _ in xrange(colnum)]
1607
                for s in setcol:
1608
                    s.clear()
1609
                #listofarrays = []
1610
                listofnditers = []
1611
                gc.collect()
1612
                step = dictsize
1613
                blocknum+=1
1614
                
1615
            if ret:
1616
                fileIter.close()
1617
                break
1618
            
1619
            
1620

    
1621
    
1622

    
1623

    
1624
   
1625

    
1626

    
1627
    if mode == 'dictpercol':
1628
        colnum = len(schema)
1629
        fastPickler.dump(schema)
1630
        listptr = [array('H') for _ in xrange(colnum) ]
1631
        compressorlistdicts = [zlib.compressobj(5) for _ in xrange(colnum)]
1632
        compressorcols = zlib.compressobj(5)
1633
        listofdicts = [{} for _ in xrange(colnum) ]
1634
        listoflens = [-1 for _ in xrange(colnum)]
1635
        start = 0
1636
        bsize = 0
1637
        numrows=0
1638
        dictlimit = 65536 - colnum
1639
        for row in diter:
1640
            numrows+=1
1641

    
1642
            for i,val in enumerate(row):
1643
                if val in listofdicts[i]:
1644
                    listptr[i].append(listofdicts[i][val])
1645
                else:
1646
                    listoflens[i] += 1
1647
                    listofdicts[i][val] = listoflens[i]
1648
                    listptr[i].append(listoflens[i])
1649
                    if type(val) in (unicode,str):
1650
                        bsize += len(val)
1651
            if bsize>BLOCK_SIZE  or max(listoflens) >= dictlimit:
1652
                prev = fileIter.tell()
1653
                headindex = [0 for _ in xrange(colnum+3)]
1654
                fileIter.write(struct.pack('L'*len(headindex), *headindex))
1655
                for i,valset in enumerate(listofdicts):
1656
                    headindex[i] = fileIter.tell()
1657
                    fileIter.write(compressorlistdicts[i].compress(marshal.dumps(sorted(valset, key=valset.get))))
1658
                headindex[colnum] = fileIter.tell()
1659
                for ar in listptr:
1660
                    fileIter.write(compressorcols.compress(ar.tostring()))
1661
                headindex[colnum+1] = fileIter.tell()
1662
                headindex[colnum+2] = numrows
1663
                fileIter.seek(prev)
1664
                fileIter.write(struct.pack('L'*len(headindex), *headindex))
1665
                fileIter.seek(headindex[colnum+1])
1666
                listofdicts = [{} for _ in xrange(colnum) ]
1667
                listoflens = [-1 for _ in xrange(colnum)]
1668
                bsize=0
1669
                numrows=0
1670
                listptr = [array('H') for _ in xrange(colnum) ]
1671
                gc.collect()
1672
        prev = fileIter.tell()
1673
        headindex = [0 for _ in xrange(colnum+3)]
1674
        fileIter.write(struct.pack('L'*len(headindex), *headindex))
1675
        for i,valset in enumerate(listofdicts):
1676
            headindex[i] = fileIter.tell()
1677
            fileIter.write(compressorlistdicts[i].compress(marshal.dumps(sorted(valset, key=valset.get))))
1678
        headindex[colnum] = fileIter.tell()
1679
        for ar in listptr:
1680
            fileIter.write(compressorcols.compress(ar.tostring()))
1681
        headindex[colnum+1] = fileIter.tell()
1682
        fileIter.seek(prev)
1683
        headindex[colnum+2] = numrows
1684
        fileIter.write(struct.pack('L'*len(headindex), *headindex))
1685

    
1686

    
1687

    
1688

    
1689

    
1690
    if mode == 'valdictcols1':
1691
        colnum = len(schema)
1692
        cPickle.dump(schema, fileIter,1)
1693
        valset = {}
1694
        lenvalset = 0
1695
        listptr = [array('H') for _ in xrange(colnum) ]
1696
        listptrappend = [listptr[i].append for i in xrange(colnum)]
1697
        start = 0
1698
        bsize = 0
1699
        dictlimit = 65536-colnum
1700
        count=0
1701
        vdefault = valset.setdefault
1702

    
1703
        for row in diter:
1704

    
1705

    
1706
            for i,app in enumerate(listptrappend):
1707
                d = vdefault(row[i], lenvalset)
1708
                app(d)
1709
                if d == lenvalset:
1710
                    lenvalset += 1
1711
                    count+=1
1712

    
1713
#            for i,val in enumerate(row):
1714
#                if val in valset:
1715
#                    listptrappend[i](valset[val])
1716
#                else:
1717
#                    listptrappend[i](lenvalset)
1718
#                    valset[val] = lenvalset
1719
#                    lenvalset += 1
1720
#                    count += 1
1721

    
1722

    
1723
#                d = vdefault(val, lenvalset)
1724
#                if d == lenvalset:
1725
#                    lenvalset += 1
1726
#                    count+=1
1727
#                listptrappend[i](d)
1728

    
1729
#
1730

    
1731
#                    if type(val) in (unicode,str):
1732
#                        bsize += len(val)
1733

    
1734
            if count == 2048:
1735
                #bsize = sys.getsizeof(valset.keys())
1736
                count = 0
1737
                        
1738
            if bsize>BLOCK_SIZE  or lenvalset >= dictlimit:
1739
                fastPickler.dump(sorted(valset.keys()))
1740
#                fastPickler.dump(listptr)
1741
                for ar in listptr:
1742
                    ar.tofile(fileIter)
1743
           #     fileIter.write( struct.pack(fmt * len(listptr[0]), *(j for i in listptr for j in i)))
1744
                valset.clear()
1745
                lenvalset = 0
1746
                bsize=0
1747
                listptr = [array('H') for _ in xrange(colnum) ]
1748
                gc.collect()
1749
        fastPickler.dump(sorted(valset.keys()))
1750
#        fastPickler.dump(listptr)
1751
        #fileIter.write( struct.pack(fmt * len(listptr[0]), *(j for i in listptr for j in i)))
1752
        for ar in listptr:
1753
            ar.tofile(fileIter)
1754

    
1755
            
1756

    
1757
    if mode == 'valdictcols':
1758
        try:
1759
            colnum = len(schema)
1760
            cPickle.dump(schema, fileIter,1)
1761
            valset = {}
1762
            lenvalset = -1
1763
            listptr = [[] for _ in xrange(colnum) ]
1764
            start = 0
1765
            step = 1024
1766
            currentblock = fileIter.tell()
1767
            count = 0
1768
            bsize = 0
1769
            nditer = zip(*itertools.islice(diter, 0, step))
1770
            gc.disable()
1771
            while True:
1772
                check = 0
1773
                i = -1
1774
                for col in nditer:
1775
                    i+=1
1776
                    check=1
1777
                    for val in col:
1778
                        if val in valset:
1779
                            listptr[i].append(valset[val])
1780
                        else:
1781
                            lenvalset += 1
1782
                            valset[val] = lenvalset
1783
                            listptr[i].append(lenvalset)
1784
                    bsize += sys.getsizeof(col)
1785
                count += step
1786
                if bsize>BLOCK_SIZE or count>=(65536-1024)/colnum or check == 0:
1787
                    fastPickler.dump(sorted(valset, key=valset.get))
1788
                    fileIter.write( struct.pack('H' * (len(listptr[0])*colnum), *(j for i in listptr for j in i)))
1789
                    valset = {}
1790
                    lenvalset = -1
1791
                    count = 0
1792
                    listptr = [[] for _ in xrange(colnum)]
1793
                    if check==0:
1794
                        break
1795

    
1796
                nditer = izip(*itertools.islice(diter, start, step))
1797

    
1798
            gc.enable()
1799
        except StopIteration,e:
1800
            gc.enable()
1801
            pass
1802

    
1803
    if mode == 'valdictrows':
1804
        try:
1805
            colnum = len(schema)
1806
            cPickle.dump(schema, fileIter,1)
1807
            valset = {}
1808
            lenvalset = -1
1809
            listptr = []
1810
            start = 0
1811
            step = 1024
1812
            currentblock = fileIter.tell()
1813
            count = 0
1814
            bsize = 0
1815
            nditer = zip(*itertools.islice(diter, 0, step))
1816
            gc.disable()
1817
            while True:
1818
                check = 0
1819
                i = 0
1820
                for col in nditer:
1821
                    i += 1
1822
                    check=1
1823
                    colcount = -1
1824
                    for val in col:
1825
                        colcount+=1
1826
                        if  i == 1:
1827
                            listptr.append([])
1828
                        if val in valset:
1829
                            listptr[count + colcount].append(valset[val])
1830
                        else:
1831
                            lenvalset += 1
1832
                            valset[val] = lenvalset
1833
                            listptr[count + colcount].append(lenvalset)
1834
                    bsize += sys.getsizeof(col)
1835
                count += step
1836
                if bsize>BLOCK_SIZE or count>=65536 or check == 0:
1837
                    fastPickler.dump(sorted(valset, key=valset.get))
1838
                    fileIter.write( struct.pack('i' * (len(listptr)*colnum), *(j for i in listptr for j in i)))
1839
                    valset = {}
1840
                    lenvalset = -1
1841
                    count = 0
1842
                    listptr = []
1843
                    if check==0:
1844
                        break
1845

    
1846
                nditer = izip(*itertools.islice(diter, start, step))
1847

    
1848
            gc.enable()
1849
        except StopIteration,e:
1850
            gc.enable()
1851
            pass
1852

    
1853
    if mode == 'topdict':
1854
        try:
1855
            colnum = len(schema)
1856
            cPickle.dump(schema, fileIter,1)
1857
            todisk = [{} for _ in xrange(colnum)]
1858
            start = 0
1859
            step = 2048/colnum
1860
            currentblock = fileIter.tell()
1861
            recnum = 0
1862
            bsize = 0
1863
            while True:
1864
                notmlist=0
1865
                temp = []
1866
                while bsize < BLOCK_SIZE  and recnum!=65536/colnum:
1867
                    nditer = itertools.islice(diter, start, step)
1868
                    mlist = list(nditer)
1869
                    if not mlist:
1870
                        notmlist=1
1871
                    #bsize += sys.getsizeof(mlist)
1872
                    recnum += step
1873
                    mlist = mlist+temp
1874
                    temp = mlist
1875
                valset = OrderedDict()
1876
                lenvalset = -1
1877
                listptr = []
1878
                rowcount = 0
1879
                for row in mlist:
1880
                    colcount=0
1881
                    listptr.append([])
1882
                    for col in row:
1883
                        if col in valset:
1884
                            listptr[rowcount].append(valset[col])
1885
                        else:
1886
                            lenvalset += 1
1887
                            valset[col] = lenvalset
1888
                            listptr[rowcount].append(lenvalset)
1889
                        colcount+=1
1890
                    rowcount+=1
1891

    
1892
                fastPickler.dump(list(valset.keys()))
1893
                fileIter.write( struct.pack('I' * (len(listptr)*colnum), *(j for i in listptr for j in i)))
1894
                bsize = 0
1895
                recnum = 0
1896
                if notmlist==1:
1897
                    break
1898

    
1899

    
1900
        except StopIteration,e:
1901
            pass
1902

    
1903

    
1904

    
1905
    if mode == 'storeindict':
1906
        try:
1907
            colnum = len(schema)
1908
            cPickle.dump(schema, fileIter,1)
1909
            todisk = [{} for _ in xrange(colnum)]
1910
            start = 0
1911
            step = 1024
1912
            currentblock = fileIter.tell()
1913
            count = 0
1914
            bsize = 0
1915
            nditer = zip(*itertools.islice(diter, 0, step))
1916
            gc.disable()
1917
            while True:
1918
                i = 0
1919
                check = 0
1920
                for col in nditer:
1921
                    check=1
1922
                    valid=0
1923
                    for val in col:
1924
                        try:
1925
                            todisk[i][val].append(valid)
1926
                        except:
1927
                            todisk[i][val] = [valid]
1928
                        valid+=1
1929
                    bsize += sys.getsizeof(col)
1930
                    i+=1
1931
                    count += step
1932
                if bsize>BLOCK_SIZE or count==32768:
1933
                    output = cStringIO.StringIO()
1934
                    fastPickler = cPickle.Pickler(output, 1)
1935
                    prev = 0
1936
                    fastPickler.fast = 1
1937
                    index_loc = currentblock + (colnum+1)*8
1938
                    index = [0 for _ in xrange(colnum+1)]
1939
                    ind = -1
1940
                    for col in todisk:
1941
                        ind += 1
1942
                        index[ind] = prev + index_loc
1943
                        fastPickler.dump(col)
1944
                        prev = output.tell()
1945
                    index[ind+1] = prev + index_loc
1946
                    currentblock = prev+index_loc
1947
                    fileIter.write(struct.pack('L'*len(index), *index))
1948
                    fileIter.write(output.getvalue())
1949
                    todisk = [{} for _ in xrange(len(schema))]
1950
                    count = 0
1951
                    bsize = 0
1952

    
1953
                nditer = izip(*itertools.islice(diter, start, step))
1954
                if check == 0:
1955
                    output = cStringIO.StringIO()
1956
                    fastPickler = cPickle.Pickler(output, 1)
1957
                    prev = 0
1958
                    fastPickler.fast = 1
1959
                    index_loc = currentblock + (colnum+1)*8
1960
                    index = [0 for _ in xrange(colnum+1)]
1961
                    ind = -1
1962
                    for col in todisk:
1963
                        ind += 1
1964
                        index[ind] = prev + index_loc
1965
                        fastPickler.dump(col)
1966
                        prev = output.tell()
1967
                    index[ind+1] = prev + index_loc
1968
                    currentblock = prev+index_loc
1969
                    fileIter.write(struct.pack('L'*len(index), *index))
1970
                    fileIter.write(output.getvalue())
1971
                    break
1972

    
1973
            gc.enable()
1974
        except StopIteration,e:
1975
            gc.enable()
1976
            pass
1977

    
1978
    def spac(fileObject,lencols):
1979
        colnum = len(schema)-1
1980
        marshal.dump(schema[1:],fileObject,2)
1981
        setcol = [set([]) for _ in xrange(colnum)]
1982
        dictsize = 65536
1983
        paxcols = []
1984
        indextype = 'H'
1985
        index_init = [0 for _ in xrange(3)]
1986
        coldicts = [{} for _ in xrange(colnum)]
1987
        prevsets =  [[] for _ in xrange(colnum)]
1988
        count = 0
1989
        blocknum = 0
1990
        compress = bz2.compress
1991
        
1992

    
1993
        while True:
1994
            maxlen = 0
1995
            exitGen = False
1996
            rows = []
1997
            try:
1998
                for i in xrange(lencols):
1999
                    rows.append((yield))
2000
            except GeneratorExit:
2001
                exitGen = True
2002
            listofvals = zip(*rows)
2003

    
2004
            if listofvals!=[]:
2005

    
2006
                for i,col in enumerate(listofvals):
2007
                    if i not in paxcols:
2008
                        setcol[i].update(col)
2009

    
2010
                prev = fileObject.tell() + 8*(colnum+2)
2011
                output = cStringIO.StringIO()
2012
                headindex = [0 for _ in xrange(colnum+2)]
2013

    
2014
                if blocknum == 0:
2015
                    for i in xrange(colnum):
2016
                        headindex[i] = output.tell() + prev
2017
                        if (len(setcol[i])*1.0/lencols>0.67):
2018
                            paxcols.append(i)
2019
                            l = index_init[:]
2020
                            t = output.tell()
2021
                            output.write(struct.pack('L'*len(l), *l))
2022
                            output.write(compress(marshal.dumps(listofvals[i],2)))
2023
                            l[0] = output.tell()
2024
                            output.seek(t)
2025
                            output.write(struct.pack('L'*len(l), *l))
2026
                            output.seek(l[0])
2027
                        else:
2028
                            prevsets[i] = list(set(setcol[i]).copy())
2029
                            coldicts[i] = dict(((x,y) for y,x in enumerate(prevsets[i])))
2030
                            coldict = coldicts[i]
2031
                            if len(prevsets[i])<256:
2032
                                indextype='B'
2033
                            else:
2034
                                indextype='H'
2035
                            l = index_init[:]
2036
                            t = output.tell()
2037
                            output.write(struct.pack('L'*len(l), *l))
2038
                            output.write(compress(marshal.dumps(prevsets[i],2)))
2039
                            l[0] = output.tell()
2040
                            output.write(compress(array(indextype,[coldict[val] for val in listofvals[i]] ).tostring()))
2041
                            l[1] = output.tell()
2042
                            output.seek(t)
2043
                            output.write(struct.pack('L'*len(l), *l))
2044
                            output.seek(l[1])
2045
                else:
2046
                    for i in xrange(colnum):
2047
                        headindex[i] = output.tell() + prev
2048
                        if i in paxcols:
2049
                            l = index_init[:]
2050
                            t = output.tell()
2051
                            output.write(struct.pack('L'*len(l), *l))
2052
                            output.write(compress(marshal.dumps(listofvals[i],2)))
2053
                            l[0] = output.tell()
2054
                            output.seek(t)
2055
                            output.write(struct.pack('L'*len(l), *l))
2056
                            output.seek(l[0])
2057
                            
2058
                        else:
2059
                            pset = set(prevsets[i])
2060
                            difnew = list(setcol[i] - pset)
2061
                            s = prevsets[i] + difnew
2062
                            d = 0
2063
                            if len(s) > dictsize:
2064
                                difold = list(pset - setcol[i])
2065
                                while len(s)>dictsize:
2066
                                    s.remove(difold[d])
2067
                                    d+=1
2068

    
2069
                            prevsets[i] = s
2070
                            coldicts[i] = dict(((x,y) for y,x in enumerate(s)))
2071
                            coldict = coldicts[i]
2072
                            towritevalues = (x for x in xrange(len(coldict)-d, len(coldict)))
2073

    
2074
                            
2075
                            l = index_init[:]
2076
                            t = output.tell()
2077
                            output.write(struct.pack('L'*len(l), *l))
2078
                            if len(prevsets[i]) != 0 :
2079
                                if len(prevsets[i])<256:
2080
                                    indextype='B'
2081
                                else:
2082
                                    indextype='H'
2083
                                output.write(compress(marshal.dumps(difnew,2)))
2084
                                l[0] = output.tell()
2085
                                output.write(compress(array(indextype,towritevalues).tostring()))
2086
                                l[1] = output.tell()
2087

    
2088
                            output.write(compress(array(indextype,[coldict[val] for val in listofvals[i]] ).tostring()))
2089
                            l[2] = output.tell()
2090
                            output.seek(t)
2091
                            output.write(struct.pack('L'*len(l), *l))
2092
                            output.seek(l[2])
2093

    
2094
                headindex[i+1] = output.tell()+ prev
2095
                headindex[i+2] = count
2096
                count=0
2097
                fileObject.write(struct.pack('L'*len(headindex), *headindex))
2098
                fileObject.write(output.getvalue())
2099
                for s in setcol:
2100
                    s.clear()
2101
                gc.collect()
2102
                blocknum+=1
2103

    
2104
            if exitGen:
2105
                fileObject.close()
2106
                break
2107

    
2108

    
2109
    def sorteddictpercol(fileIter,lencols,compression,level):
2110
        colnum = len(schema)-1
2111
        fileIter.write(struct.pack('!B', 0))
2112
        cPickle.dump(schema[1:],fileIter,1)
2113
        paxcols = []
2114
        blocknum = 0
2115
        output = cStringIO.StringIO()
2116
#        tempio = cStringIO.StringIO()
2117
#        fastPickler = cPickle.Pickler(tempio, 2)
2118
#        fastPickler.fast = 1
2119
        exitGen=False
2120
        compress = zlib.compress
2121
        if compression == BZ2:
2122
            compress = bz2.compress
2123
        if lencols == 0:
2124
            (yield)
2125

    
2126
        
2127
        while not exitGen:
2128
            output.truncate(0)
2129
            mrows = []
2130
            try:
2131
                for i in xrange(lencols):
2132
                    mrows.append((yield))
2133
            except GeneratorExit:
2134
                exitGen = True
2135
            
2136
            count = len(mrows)
2137
            headindex = [0 for _ in xrange((colnum*2)+1)]
2138
            type = '!'+'i'*len(headindex)
2139
            output.write(struct.pack(type, *headindex))
2140
            if mrows != []:
2141
                for i, col in enumerate((tuple(x[c] for x in mrows) for c in xrange(colnum))):
2142
                    if blocknum==0:
2143
                        s = sorted(set(col))
2144
                        lens = len(s)
2145
                        if lens>50*1.0*count/100:
2146
                            paxcols.append(i)
2147
                            l = output.tell()
2148
#                            tempio.truncate(0)
2149
#                            fastPickler.dump(col)
2150
                            output.write(zlib.compress(marshal.dumps(col),level))
2151
                            headindex[i*2] = output.tell() - l
2152
                        else:
2153
                            coldict = dict(((x,y) for y,x in enumerate(s)))
2154
                            l = output.tell()
2155
#                            tempio.truncate(0)
2156
#                            fastPickler.dump(s)
2157
                            output.write(zlib.compress(marshal.dumps(s),level))
2158
                            headindex[i*2] = output.tell()-l
2159
                            if lens>1:
2160
                                if lens<256:
2161
                                    output.write(compress(array('B',[coldict[y] for y in col]).tostring(),level))
2162
                                else:
2163
                                    output.write(compress(array('H',[coldict[y] for y in col]).tostring(),level))
2164
                            headindex[i*2+1] = output.tell()-l-headindex[i*2]
2165
                    else:
2166
                        if i in paxcols:
2167
                            l = output.tell()
2168
#                            tempio.truncate(0)
2169
#                            fastPickler.dump(col)
2170
                            output.write(zlib.compress(marshal.dumps(col),level))
2171
                            headindex[i*2] = output.tell() - l
2172
                        else:
2173
                            s = sorted(set(col))
2174
                            lens = len(s)
2175
                            coldict = dict(((x,y) for y,x in enumerate(s)))
2176
                            l = output.tell()
2177
#                            tempio.truncate(0)
2178
#                            fastPickler.dump(s)
2179
                            output.write(zlib.compress(marshal.dumps(s),level))
2180
                            headindex[i*2] = output.tell()-l
2181
                            if lens>1:
2182
                                if lens<256:
2183
                                    output.write(compress(array('B',[coldict[y] for y in col]).tostring(),level))
2184
                                else:
2185
                                    output.write(compress(array('H',[coldict[y] for y in col]).tostring(),level))
2186
                            headindex[i*2+1] = output.tell()-l-headindex[i*2]
2187

    
2188
                blocknum=1
2189
                headindex[colnum*2] = count
2190
                output.seek(0)
2191
                type = '!'+'i'*len(headindex)
2192
                output.write(struct.pack(type, *headindex))
2193
                cz = output.getvalue()
2194
                fileIter.write(struct.pack('!B', 1))
2195
                fileIter.write(struct.pack('!i',len(cz)))
2196

    
2197
                fileIter.write(cz)
2198
                gc.collect()
2199
        fileIter.close()
2200

    
2201

    
2202
    def corsorteddictpercol(fileIter,lencols,compression,level):
2203
        colnum = len(schema)-1
2204
        fileIter.write(struct.pack('!B', 0))
2205
        cPickle.dump(schema[1:],fileIter,1)
2206
        paxcols = []
2207
        blocknum = 0
2208
        output = cStringIO.StringIO()
2209
        tempio = cStringIO.StringIO()
2210
        fastPickler = cPickle.Pickler(tempio, 2)
2211
        fastPickler.fast = 1
2212
        exitGen=False
2213
        corelcols = {}
2214
        compress = zlib.compress
2215
        if compression == BZ2:
2216
            compress = bz2.compress
2217
        if lencols == 0:
2218
            (yield)
2219

    
2220

    
2221
        while not exitGen:
2222
            output.truncate(0)
2223
            mrows = []
2224
            try:
2225
                for i in xrange(lencols):
2226
                    mrows.append((yield))
2227
            except GeneratorExit:
2228
                exitGen = True
2229

    
2230
            count = len(mrows)
2231
            headindex = [0 for _ in xrange((colnum*2)+1)]
2232
            type = '!'+'i'*len(headindex)
2233
            colsets = [set([]) for _ in xrange(colnum)]
2234
            lencolsets = [[] for _ in xrange(colnum)]
2235
            if mrows != []:
2236
                if blocknum == 0:
2237
                    for i, col in enumerate((tuple(x[c] for x in mrows) for c in xrange(colnum))):
2238
                        colsets[i] = set(col)
2239
                        lencolsets[i] = len(colsets[i])
2240
                        if lencolsets[i]>50*1.0*count/100:
2241
                            paxcols.append(i)
2242
                            l = output.tell()
2243
                            tempio.truncate(0)
2244
                            fastPickler.dump(col)
2245
                            output.write(zlib.compress(tempio.getvalue(),level))
2246
                            headindex[i*2] = output.tell() - l
2247
                        else:
2248
                            s = sorted(colsets[i])
2249
                            lens = lencolsets[i]
2250
                            coldict = dict(((x,y) for y,x in enumerate(s)))
2251
                            l = output.tell()
2252
                            tempio.truncate(0)
2253
                            fastPickler.dump(s)
2254
                            output.write(compress(tempio.getvalue(),level))
2255
                            headindex[i*2] = output.tell()-l
2256
                            if lens>1:
2257
                                if lens<256:
2258
                                    output.write(compress(array('B',[coldict[y] for y in col]).tostring(),level))
2259
                                else:
2260
                                    output.write(compress(array('H',[coldict[y] for y in col]).tostring(),level))
2261
                            headindex[i*2+1] = output.tell()-l-headindex[i*2]
2262

    
2263
                    notpaxcols = [x for x in xrange(colnum) if x not in paxcols]
2264
                    corelcandidates = []
2265
                    for i in xrange(2,len(notpaxcols)+1):
2266
                        corelcandidates += list(itertools.combinations(notpaxcols,i))
2267
                    print len(corelcandidates)
2268

    
2269

    
2270
                else:
2271
                    for i, col in enumerate((tuple(x[c] for x in mrows) for c in xrange(colnum))):
2272

    
2273
                        if i in paxcols:
2274
                            l = output.tell()
2275
                            tempio.truncate(0)
2276
                            fastPickler.dump(col)
2277
                            output.write(zlib.compress(tempio.getvalue(),level))
2278
                            headindex[i*2] = output.tell() - l
2279
                        else:
2280
                            s = sorted(set(col))
2281
                            lens = len(s)
2282
                            coldict = dict(((x,y) for y,x in enumerate(s)))
2283
                            l = output.tell()
2284
                            tempio.truncate(0)
2285
                            fastPickler.dump(s)
2286
                            output.write(compress(tempio.getvalue(),level))
2287
                            headindex[i*2] = output.tell()-l
2288
                            if lens>1:
2289
                                if lens<256:
2290
                                    output.write(compress(array('B',[coldict[y] for y in col]).tostring(),level))
2291
                                else:
2292
                                    output.write(compress(array('H',[coldict[y] for y in col]).tostring(),level))
2293
                            headindex[i*2+1] = output.tell()-l-headindex[i*2]
2294

    
2295
                blocknum=1
2296
                headindex[colnum*2] = count
2297
                output.seek(0)
2298
                type = '!'+'i'*len(headindex)
2299
                output.write(struct.pack(type, *headindex))
2300
                cz = output.getvalue()
2301
                fileIter.write(struct.pack('!B', 1))
2302
                fileIter.write(struct.pack('!i',len(cz)))
2303

    
2304
                fileIter.write(cz)
2305
                gc.collect()
2306
        fileIter.close()
2307

    
2308

    
2309
    def sorteddictpercolspac(fileIter,lencols):
2310
        colnum = len(schema)-1
2311
        fileIter.write(struct.pack('!B', 0))
2312
        cPickle.dump(schema[1:],fileIter,1)
2313
        paxcols = []
2314
        spaccols = []
2315
        blocknum = 0
2316
        output = cStringIO.StringIO()
2317
        tempio = cStringIO.StringIO()
2318
        fastPickler = cPickle.Pickler(tempio, 1)
2319
        fastPickler.fast = 1
2320
        exitGen=False
2321
        firstblocksets =  [set([]) for _ in xrange(colnum)]
2322
        if lencols == 0:
2323
            (yield)
2324

    
2325
        while not exitGen:
2326
            output.truncate(0)
2327
            mrows = []
2328
            try:
2329
                for i in xrange(lencols):
2330
                    mrows.append((yield))
2331
            except GeneratorExit:
2332
                exitGen = True
2333

    
2334
            count = len(mrows)
2335
            headindex = [0 for _ in xrange((colnum*2)+1)]
2336
            type = '!'+'i'*len(headindex)
2337
            output.write(struct.pack(type, *headindex))
2338
            if mrows != []:
2339
                for i, col in enumerate((tuple(x[c] for x in mrows) for c in xrange(colnum))):
2340
                    if blocknum==0:
2341
                        firstblocksets[i] = sorted(set(col))
2342
                        s = firstblocksets[i]
2343
                        lens = len(s)
2344
                        if lens>50*1.0*count/100:
2345
                            paxcols.append(i)
2346
                            l = output.tell()
2347
                            tempio.truncate(0)
2348
                            fastPickler.dump(col)
2349
                            output.write(zlib.compress(tempio.getvalue(),5))
2350
                            headindex[i*2] = output.tell() - l
2351
                        else:
2352
                            coldict = dict(((x,y) for y,x in enumerate(s)))
2353
                            l = output.tell()
2354
                            tempio.truncate(0)
2355
                            fastPickler.dump(s)
2356
                            output.write(zlib.compress(tempio.getvalue(),5))
2357
                            headindex[i*2] = output.tell()-l
2358
                            if lens>1:
2359
                                if lens<256:
2360
                                    output.write(zlib.compress(array('B',[coldict[y] for y in col]).tostring(),5))
2361
                                else:
2362
                                    output.write(zlib.compress(array('H',[coldict[y] for y in col]).tostring(),5))
2363
                            headindex[i*2+1] = output.tell()-l-headindex[i*2]
2364
                    elif blocknum==1:
2365
                         if i in paxcols:
2366
                            l = output.tell()
2367
                            tempio.truncate(0)
2368
                            fastPickler.dump(col)
2369
                            output.write(zlib.compress(tempio.getvalue(),5))
2370
                            headindex[i*2] = output.tell() - l
2371
                         else:
2372
                            s = sorted(set(col))
2373
                            difsets = list(set(s)-set(firstblocksets[i]))
2374
                            lendif = len(difsets)
2375
                            lenfirst = len(firstblocksets[i])
2376
                            if dif*1.0/lenfirst < 0.1:
2377
                                if (lenfirst < 135) or (lenfirst < 28000 and lenfirst > 256):
2378
                                    spaccols.append(i)
2379
                                    difold = list(prevsets[i]-setcol[i])
2380
                                    prevsets[i].intersection_update(setcol[i])
2381
                                    prevsets[i].update(difnew)
2382
                                    prevsets[i].update(difold[:(dictsize-len(prevsets[i]))])
2383
                                    towrite = {}
2384
                                    le = len(difold)
2385
                                    si = len(coldicts[i])
2386
                                    for l,j in enumerate(difnew):
2387
                                        if l<le:
2388
                                            towrite[j] = coldicts[i][difold[l]]
2389
                                        else:
2390
                                            towrite[j] = si
2391
                                            si+=1
2392
                                    coldicts[i] = dict(((x,y) for y,x in enumerate(prevsets[i])))
2393
                                    coldict = coldicts[i]
2394
                                    if len(prevsets[i]) != 0 :
2395
                                        if len(prevsets[i])<256:
2396
                                            indextype='B'
2397
                                        else:
2398
                                            indextype='H'
2399
                                        output.write(zlib.compress(marshal.dumps(towrite.keys(),2)))
2400
                                        output.write(zlib.compress(array(indextype,towrite.values()).tostring()))
2401
                                    #listofarrays.append(array(indextype,[coldict[val] for subl in [x[i] for x in listofnditers] for val in subl]))
2402
                                    output.write(zlib.compress(array(indextype,[coldict[val] for val in listofvals[i]] ).tostring()))
2403

    
2404

    
2405
                                else:
2406
                                    lens = len(s)
2407
                                    coldict = dict(((x,y) for y,x in enumerate(s)))
2408
                                    l = output.tell()
2409
                                    tempio.truncate(0)
2410
                                    fastPickler.dump(s)
2411
                                    output.write(zlib.compress(tempio.getvalue(),5))
2412
                                    headindex[i*2] = output.tell()-l
2413
                                    if lens>1:
2414
                                        if lens<256:
2415
                                            output.write(zlib.compress(array('B',[coldict[y] for y in col]).tostring(),5))
2416
                                        else:
2417
                                            output.write(zlib.compress(array('H',[coldict[y] for y in col]).tostring(),5))
2418
                                    headindex[i*2+1] = output.tell()-l-headindex[i*2]
2419
                    else:
2420
                        if i in paxcols:
2421
                            l = output.tell()
2422
                            tempio.truncate(0)
2423
                            fastPickler.dump(col)
2424
                            output.write(zlib.compress(tempio.getvalue(),5))
2425
                            headindex[i*2] = output.tell() - l
2426
                        if i in spaccols:
2427
                            pass
2428
                        else:
2429
                            s = sorted(set(col))
2430
                            lens = len(s)
2431
                            coldict = dict(((x,y) for y,x in enumerate(s)))
2432
                            l = output.tell()
2433
                            tempio.truncate(0)
2434
                            fastPickler.dump(s)
2435
                            output.write(zlib.compress(tempio.getvalue(),5))
2436
                            headindex[i*2] = output.tell()-l
2437
                            if lens>1:
2438
                                if lens<256:
2439
                                    output.write(zlib.compress(array('B',[coldict[y] for y in col]).tostring(),5))
2440
                                else:
2441
                                    output.write(zlib.compress(array('H',[coldict[y] for y in col]).tostring(),5))
2442
                            headindex[i*2+1] = output.tell()-l-headindex[i*2]
2443

    
2444
                blocknum+=1
2445
                headindex[colnum*2] = count
2446
                output.seek(0)
2447
                type = '!'+'i'*len(headindex)
2448
                output.write(struct.pack(type, *headindex))
2449
                cz = output.getvalue()
2450
                fileIter.write(struct.pack('!B', 1))
2451
                fileIter.write(struct.pack('!i',len(cz)))
2452

    
2453
                fileIter.write(cz)
2454
                gc.collect()
2455
        fileIter.close()
2456

    
2457
    def rcfile(fileObject,lencols,compression,level):
2458
        colnum = len(schema) - 1
2459
        structHeader = '!'+'i' * colnum
2460
        indexinit = [0 for _ in xrange(colnum)]
2461
        fileObject.write(struct.pack('!B', 0))
2462
        cPickle.dump(schema[1:],fileObject,1)
2463
#        l = cStringIO.StringIO()
2464
#        fastPickler = cPickle.Pickler(l, 2)
2465
#        fastPickler.fast = 1
2466
        exitGen = False
2467
        compress = zlib.compress
2468
        if compression == BZ2:
2469
            compress = bz2.compress
2470
        if lencols == 0:
2471
            (yield)
2472
            
2473
        while not exitGen:
2474
            rows = []
2475
            try:
2476
                for i in xrange(lencols):
2477
                    rows.append((yield))
2478
            except GeneratorExit:
2479
                exitGen = True
2480
                
2481
            index = indexinit[:]
2482
            output = cStringIO.StringIO()
2483
            
2484
            output.write(struct.pack('!B', 1))
2485
            output.write(struct.pack(structHeader, *index))
2486
            if rows != []:
2487
                for i, col in enumerate(([x[c] for x in rows] for c in xrange(colnum))):
2488
#                    l.truncate(0)
2489
#                    fastPickler.dump(col)
2490
                    cz = zlib.compress(listser.dumps(col), 5)
2491
                    output.write(cz)
2492
                    index[i] = len(cz)
2493
                output.seek(1)
2494
                output.write(struct.pack(structHeader, *index))
2495
                fileObject.write(output.getvalue())
2496

    
2497

    
2498
        fileObject.close()
2499

    
2500
    def rcfilenonsplit(fileObject=fileIter,colnum = (len(schema))):
2501
            structHeader = 'L' * (colnum+2)
2502
            indexinit = [0 for _ in xrange(colnum+2)]
2503
            marshal.dump(schema[len(schema)-colnum:], fileObject,2)
2504
            todisk = [[] for _ in xrange(colnum)]
2505
            start = 0
2506
            step = 1024
2507
            currentblock = fileObject.tell()
2508
            bsize = 0
2509
            lencols = 0
2510
            blocknum = 0
2511
            exitGen = 1
2512
            while exitGen:
2513
                nditer = izip(*itertools.islice(diter, start, step))
2514
                i = 0
2515
                exitGen = 0
2516
                for col in nditer:
2517
                    exitGen=1
2518
                    todisk[i] += col
2519
                    if blocknum == 0:
2520
                        for val in col:
2521
                            bsize += getSize(val)
2522
                    elif i == 0 and len(todisk[0]) >= lencols:
2523
                        bsize = BLOCK_SIZE+1
2524

    
2525
                    i+=1
2526
                if bsize>BLOCK_SIZE or not exitGen:
2527
                    index = indexinit[:]
2528
                    lencols = len(todisk[0])
2529
                    blocknum += 1
2530
                    ind = -1
2531
                    output = cStringIO.StringIO()
2532
                    indi = fileObject.tell()
2533
                    output.write(struct.pack(structHeader, *index))
2534
                    for i,col in enumerate(todisk):
2535
                        ind += 1
2536
                        index[ind] = output.tell()+indi
2537
                        output.write(zlib.compress(marshal.dumps(col,2)))
2538
                        index[ind+1] = output.tell()+indi
2539
                        if not exitGen:
2540
                            index[colnum+1] = 1
2541
                    output.seek(0)
2542
                    output.write(struct.pack(structHeader, *index))
2543
                    fileObject.write(output.getvalue())
2544
                    todisk = [[] for _ in xrange(len(schema))]
2545
                    bsize = 0
2546

    
2547
    def calclencols(mode):
2548
            if mode==RCFILE:
2549
                count = 0
2550
                bsize = 0
2551
                rows = []
2552
                try:
2553
                    while bsize<BLOCK_SIZE:
2554
                        row = diter.next()
2555
                        rows.append(row)
2556
                        count += 1
2557
                        bsize += sum((getSize(v) for v in row[1:]))
2558
                except StopIteration:
2559
                    pass
2560
                return count+10*count/100 , rows
2561
            if mode==SDC:
2562
                count = 0
2563
                bsize = 0
2564
                rows = []
2565
                try:
2566
                    while bsize<BLOCK_SIZE and count<65535:
2567
                        row = diter.next()
2568
                        rows.append(row)
2569
                        count += 1
2570
                        bsize += sum((getSize(v) for v in row[1:]))
2571

    
2572
                except StopIteration:
2573
                    pass
2574
                return count , rows
2575

    
2576

    
2577
    if mode == 'spac1':
2578
        if 'split' in formatArgs:
2579
            filesNum = int(formatArgs['split'])
2580
            filesList = [None]*filesNum
2581
            lencols , rows = calclencols()
2582
            for key in xrange(int(formatArgs['split'])) :
2583
                filesList[key] = open(os.path.join(fullpath, filename+'.'+str(key)), 'a')
2584

    
2585
            spacgen = [spac(x,lencols) for x in filesList]
2586
            spacgensend = [x.send for x in spacgen]
2587
            for j in spacgensend:
2588
                j(None)
2589
            for row in rows:
2590
                spacgensend[row[0]](row[1:])
2591
            del(rows)
2592
            for row in diter:
2593
                spacgensend[row[0]](row[1:])
2594
            for j in spacgen:
2595
                j.close()
2596
        else :
2597
            rcfilenonsplit()
2598

    
2599

    
2600
    if mode == 'sdc':
2601
        time1 = time.time()
2602
        if 'split' in formatArgs:
2603
            filesNum = int(formatArgs['split'])
2604
            filesList = [None]*filesNum
2605
            lencols , rows = calclencols(SDC)
2606
            for key in xrange(int(formatArgs['split'])) :
2607
                filesList[key] = open(os.path.join(fullpath, filename+'.'+str(key)), 'wb')
2608

    
2609

    
2610
            sdcgen = [sorteddictpercol(x,lencols,compression,level) for x in filesList]
2611
            sdcgensend = [x.send for x in sdcgen]
2612
            for j in sdcgensend:
2613
                j(None)
2614
            for row in rows:
2615
                sdcgensend[row[0]](row[1:])
2616
            del(rows)
2617
            for row in diter:
2618
                sdcgensend[row[0]](row[1:])
2619
            for j in sdcgen:
2620
                j.close()
2621
        time2 = time.time()
2622
        stats = open('compressionstatistics.tsv', 'a')
2623
        stat = "sdc_"+compression+str(level)
2624
        statstr = stat + "\t" + str(time2-time1) + "\t" + str(os.path.getsize(os.path.join(fullpath, filename+'.'+str(0)))) + "\n"
2625
        stats.write(statstr)
2626

    
2627
    
2628
    if mode == 'rcfile':
2629
        time1 = time.time()
2630
        if 'split' in formatArgs:
2631
            filesNum = int(formatArgs['split'])
2632
            filesList = [None]*filesNum
2633
            lencols , rows = calclencols(RCFILE)
2634
            for key in xrange(int(formatArgs['split'])) :
2635
                filesList[key] = open(os.path.join(fullpath, filename+'.'+str(key)), 'wb')
2636

    
2637

    
2638
            rcgen = [rcfile(x,lencols,compression,level) for x in filesList]
2639
            rcgensend = [x.send for x in rcgen]
2640
            for j in rcgensend:
2641
                j(None)
2642
            for row in rows:
2643
                rcgensend[row[0]](row[1:])
2644
            del(rows)
2645
            for row in diter:
2646
                rcgensend[row[0]](row[1:])
2647
            for j in rcgen:
2648
                j.close()
2649
        else :
2650
            rcfilenonsplit()
2651
        time2 = time.time()
2652
        stats = open('compressionstatistics.tsv', 'a')
2653
        stat = "rcfile_"+compression+str(level)
2654
        statstr = stat + "\t" + str(time2-time1) + "\t" + str(os.path.getsize(os.path.join(fullpath, filename+'.'+str(0)))) + "\n"
2655
        stats.write(statstr)
2656

    
2657

    
2658

    
2659
    if mode == 'itertools':
2660
        step = 1024
2661
        while True:
2662
            check = 0
2663
            for col in diter:
2664
                check = 1
2665
                break
2666
            if check == 0:
2667
                break
2668

    
2669
    if mode == 'deduplipax':
2670
        try:
2671
            colnum = len(schema)
2672
            cPickle.dump(schema, fileIter,1)
2673
            todisk = [[] for _ in xrange(colnum)]
2674
            start = 0
2675
            step = 1024
2676
            compr = zlib.BZ2Compressor(9)
2677
            currentblock = fileIter.tell()
2678
            myD=[{} for _ in xrange(colnum)]
2679
            bsize = 0
2680
            nditer = zip(*itertools.islice(diter, 0, step))
2681
            
2682
            while True:
2683
                i = 0
2684
                for col in nditer:
2685
                    bsize += sys.getsizeof(col)
2686
                    #for val in col:
2687
                    #    todisk[i].append(myD.setdefault(val,val))
2688
#                    todisk[i] += col
2689
                    todisk[i] += [myD[i].setdefault(val,val) for val in col]
2690
                    i+=1
2691
                if bsize>BLOCK_SIZE:
2692
                    output = cStringIO.StringIO()
2693
                    fastPickler = cPickle.Pickler(output, 1)
2694
                    prev = 0
2695
                    #fastPickler.fast = 1
2696
                    index_loc = currentblock + (colnum+1)*8
2697
                    index = [0 for _ in xrange(colnum+1)]
2698
                    ind = -1
2699
                    for col in todisk:
2700
                        ind += 1
2701
                        index[ind] = prev + index_loc
2702
                        output.write(compr.compress(cPickle.dumps(col)))
2703
                        prev = output.tell()
2704
                    index[ind+1] = prev + index_loc
2705
                    currentblock = prev+index_loc
2706
                    fileIter.write(struct.pack('L'*len(index), *index))
2707
                    fileIter.write(output.getvalue())
2708
                    todisk = [[] for _ in xrange(len(schema))]
2709
                    count = 0
2710
                    bsize = 0
2711
                if not nditer:
2712
                    output = cStringIO.StringIO()
2713
                    fastPickler = cPickle.Pickler(output, 1)
2714
                    prev = 0
2715
                    #fastPickler.fast = 1
2716
                    index_loc = currentblock + (colnum+1)*8
2717
                    index = [0 for _ in xrange(colnum+1)]
2718
                    ind = -1
2719
                    for col in todisk:
2720
                        ind += 1
2721
                        index[ind] = prev + index_loc
2722
                        output.write(zlib.compress(cPickle.dumps(col),9))
2723
                        prev = output.tell()
2724
                    index[ind+1] = prev + index_loc
2725
                    currentblock = prev+index_loc
2726
                    fileIter.write(struct.pack('L'*len(index), *index))
2727
                    fileIter.write(output.getvalue())
2728
                    todisk = [[] for _ in xrange(len(schema))]
2729
                    count = 0
2730
                    bsize = 0
2731
                    break
2732
                else:
2733
                    nditer = zip(*itertools.islice(diter, start, step))
2734
               
2735
            
2736
        except StopIteration,e:
2737
            
2738
            pass
2739

    
2740
    if mode == 'row':   # rowstore periptwsi
2741
        lencols,rows = calclencols()
2742
        cPickle.dump(schema,fileIter,1)
2743
        l = cStringIO.StringIO()
2744
        fastPickler = cPickle.Pickler(l, 1)
2745
        fastPickler.fast = 1
2746
        last = 0
2747

    
2748
        l.truncate(0)
2749
        fastPickler.dump(rows)
2750
        cz = zlib.compress(l.getvalue(),5)
2751
        s = len(cz)
2752
        fileIter.write(struct.pack('i', s))
2753
        fileIter.write(cz)
2754
        
2755
        while last == 0:
2756
            rows = list(itertools.islice(diter, 0, lencols))
2757
            if len(rows)<lencols:
2758
                last = 1
2759
            l.truncate(0)
2760
            fastPickler.dump(rows)
2761
            cz = zlib.compress(l.getvalue(),5)
2762
            s = len(cz)
2763
            fileIter.write(struct.pack('i', s))
2764
            fileIter.write(cz)
2765

    
2766

    
2767
    try:
2768
        if 'split' not in formatArgs:
2769
            fileIter.close()
2770
    except NameError:
2771
        pass
2772

    
2773
boolargs=lib.inoutparsing.boolargs+['compression']
2774

    
2775

    
2776
def Source():
2777
    global boolargs, nonstringargs
2778
    return SourceNtoOne(outputData,boolargs, lib.inoutparsing.nonstringargs,lib.inoutparsing.needsescape, connectionhandler=True)
2779

    
2780

    
2781
if not ('.' in __name__):
2782
    """
2783
    This is needed to be able to test the function, put it at the end of every
2784
    new function you create
2785
    """
2786
    import sys
2787
    import setpath
2788
    from functions import *
2789
    testfunction()
2790
    if __name__ == "__main__":
2791
        reload(sys)
2792
        sys.setdefaultencoding('utf-8')
2793
        import doctest
2794
        doctest.testmod()
(7-7/44)