Project

General

Profile

-- Avro load function for the document-content input.
-- The Avro schema is supplied through the $schema_document_content
-- Pig parameter, substituted before the script runs.
define avro_load_document_content
    org.apache.pig.piggybank.storage.avro.AvroStorage(
        'schema', '$schema_document_content');

-- Avro load function for the document-text input.
-- The Avro schema is supplied through the $schema_document_text
-- Pig parameter, substituted before the script runs.
define avro_load_document_text
    org.apache.pig.piggybank.storage.avro.AvroStorage(
        'schema', '$schema_document_text');

-- Avro store function for the filtered document-content output.
-- Uses the schema from the $schema_document_content parameter.
-- NOTE(review): 'index' '0' here vs. '1' on the other store function looks
-- like the piggybank AvroStorage convention of giving each store statement
-- in a multi-store script a distinct index -- confirm against AvroStorage docs.
define avro_store_document_content
    org.apache.pig.piggybank.storage.avro.AvroStorage(
        'index', '0',
        'schema', '$schema_document_content');

-- Avro store function for the filtered document-text output.
-- Uses the schema from the $schema_document_text parameter.
-- NOTE(review): 'index' '1' here vs. '0' on the other store function looks
-- like the piggybank AvroStorage convention of giving each store statement
-- in a multi-store script a distinct index -- confirm against AvroStorage docs.
define avro_store_document_text
    org.apache.pig.piggybank.storage.avro.AvroStorage(
        'index', '1',
        'schema', '$schema_document_text');

-- Load both inputs with the Avro load functions defined earlier in the script.
documentContent = load '$input_document_content' using avro_load_document_content;
documentText = load '$input_document_text' using avro_load_document_text;

-- Distinct set of ids that have a document-text record.
documentTextId = foreach documentText generate id;
cachedDocumentIdDistinct = distinct documentTextId;

-- Anti-join: left-join content against the text ids and keep only the rows
-- with no match, i.e. content records whose id has no document-text record.
joinedDocumentContent = join documentContent by id left, cachedDocumentIdDistinct by id;
joinedFilteredDocumentContent = filter joinedDocumentContent by cachedDocumentIdDistinct::id is null;

-- Drop the join-side column and restore the plain field names.
documentContentFiltered = foreach joinedFilteredDocumentContent generate
    documentContent::id as id,
    documentContent::url as url,
    documentContent::mimeType as mimeType,
    documentContent::contentChecksum as contentChecksum,
    documentContent::contentSizeKB as contentSizeKB;

-- Distinct set of ids present in the document-content input.
documentContentId = foreach documentContent generate id;
documentContentIdDistinct = distinct documentContentId;

-- Inner join: keep only document-text records whose id also appears in the
-- document-content input, then project back to the (id, text) fields.
joinedDocumentText = join documentText by id, documentContentIdDistinct by id;
documentTextFiltered = foreach joinedDocumentText generate
    documentText::id as id,
    documentText::text as text;

-- Write both results with their respective Avro store functions.
store documentContentFiltered into '$output_document_content' using avro_store_document_content;
store documentTextFiltered into '$output_document_text' using avro_store_document_text;
    (1-1/1)