1
|
-- Loader alias for the document-content input: piggybank AvroStorage
-- configured with the schema supplied through the schema_document_content
-- script parameter.
DEFINE avro_load_document_content org.apache.pig.piggybank.storage.avro.AvroStorage('schema', '$schema_document_content');
|
4
|
|
5
|
-- Loader alias for the document-text input: piggybank AvroStorage
-- configured with the schema supplied through the schema_document_text
-- script parameter.
DEFINE avro_load_document_text org.apache.pig.piggybank.storage.avro.AvroStorage('schema', '$schema_document_text');
|
8
|
|
9
|
-- Store-function alias for the filtered document-content output. It uses the
-- same schema parameter as the content loader.
-- NOTE(review): the index option presumably gives each AvroStorage store in
-- this script a distinct id (0 here, 1 for the text store) - confirm against
-- the piggybank AvroStorage documentation.
DEFINE avro_store_document_content org.apache.pig.piggybank.storage.avro.AvroStorage(
    'index', '0',
    'schema', '$schema_document_content'
);
|
13
|
|
14
|
-- Store-function alias for the filtered document-text output. It uses the
-- same schema parameter as the text loader.
-- NOTE(review): the index option presumably gives each AvroStorage store in
-- this script a distinct id (1 here, 0 for the content store) - confirm
-- against the piggybank AvroStorage documentation.
DEFINE avro_store_document_text org.apache.pig.piggybank.storage.avro.AvroStorage(
    'index', '1',
    'schema', '$schema_document_text'
);
|
18
|
|
19
|
-- Read the two inputs, each through the loader alias declared for its schema.
-- Input locations come from the input_document_content and
-- input_document_text script parameters.
documentContent = LOAD '$input_document_content'
    USING avro_load_document_content;
documentText = LOAD '$input_document_text'
    USING avro_load_document_text;
|
21
|
|
22
|
-- Anti-join: keep only the content records whose id does NOT occur in
-- documentText.
--   1. collect the distinct ids present in documentText
--   2. left outer join documentContent against those ids
--   3. keep the rows where the right-hand side found no match (id is null)
--   4. project the surviving rows back to plain documentContent fields
textIds = FOREACH documentText GENERATE id;
uniqueTextIds = DISTINCT textIds;

contentWithTextIds = JOIN documentContent BY id LEFT OUTER, uniqueTextIds BY id;
contentWithoutText = FILTER contentWithTextIds BY uniqueTextIds::id IS NULL;

documentContentFiltered = FOREACH contentWithoutText GENERATE
    documentContent::id AS id,
    documentContent::url AS url,
    documentContent::mimeType AS mimeType,
    documentContent::contentChecksum AS contentChecksum,
    documentContent::contentSizeKB AS contentSizeKB;
|
29
|
|
30
|
-- Semi-join: keep only the text records whose id also occurs in
-- documentContent. The content ids are made distinct first, so each text
-- record matches at most one id row and the inner join cannot multiply it.
contentIds = FOREACH documentContent GENERATE id;
uniqueContentIds = DISTINCT contentIds;

textWithContent = JOIN documentText BY id, uniqueContentIds BY id;

-- Drop the join-side id column and restore the plain field names.
documentTextFiltered = FOREACH textWithContent GENERATE
    documentText::id AS id,
    documentText::text AS text;
|
35
|
|
36
|
-- Write both filtered relations, each through the store alias declared for
-- its schema. Output locations come from the output_document_content and
-- output_document_text script parameters.
STORE documentContentFiltered
    INTO '$output_document_content'
    USING avro_store_document_content;
STORE documentTextFiltered
    INTO '$output_document_text'
    USING avro_store_document_text;
|