Revision 29482
Added by Marek Horst almost 10 years ago
modules/icm-iis-transformers/trunk/src/main/resources/eu/dnetlib/iis/transformers/importer/plaintext/skip_extracted/oozie_app/lib/scripts/transformer.pig | ||
---|---|---|
1 |
define avro_load_document_content |
|
2 |
org.apache.pig.piggybank.storage.avro.AvroStorage( |
|
3 |
'input_schema_class', '$schema_document_content'); |
|
4 |
|
|
5 |
define avro_load_document_text |
|
6 |
org.apache.pig.piggybank.storage.avro.AvroStorage( |
|
7 |
'input_schema_class', '$schema_document_text'); |
|
8 |
|
|
9 |
define avro_store_document_content |
|
10 |
org.apache.pig.piggybank.storage.avro.AvroStorage( |
|
11 |
'index', '0', |
|
12 |
'output_schema_class', '$schema_document_content'); |
|
13 |
|
|
14 |
define avro_store_document_text |
|
15 |
org.apache.pig.piggybank.storage.avro.AvroStorage( |
|
16 |
'index', '1', |
|
17 |
'output_schema_class', '$schema_document_text'); |
|
18 |
|
|
19 |
documentContent = load '$input_document_content' using avro_load_document_content; |
|
20 |
documentText = load '$input_document_text' using avro_load_document_text; |
|
21 |
|
|
22 |
documentTextId = foreach documentText generate id; |
|
23 |
|
|
24 |
cachedDocumentIdDistinct = distinct documentTextId; |
|
25 |
|
|
26 |
joinedDocumentContent = join documentContent by id left, cachedDocumentIdDistinct by id; |
|
27 |
joinedFilteredDocumentContent = filter joinedDocumentContent by cachedDocumentIdDistinct::id is null; |
|
28 |
documentContentFiltered = foreach joinedFilteredDocumentContent generate documentContent::id as id, documentContent::url as url, documentContent::mimeType as mimeType, documentContent::contentChecksum as contentChecksum; |
|
29 |
|
|
30 |
documentContentId = foreach documentContent generate id; |
|
31 |
documentContentIdDistinct = distinct documentContentId; |
|
32 |
|
|
33 |
joinedDocumentText = join documentText by id, documentContentIdDistinct by id; |
|
34 |
documentTextFiltered = foreach joinedDocumentText generate documentText::id as id, documentText::text as text; |
|
35 |
|
|
36 |
store documentContentFiltered into '$output_document_content' using avro_store_document_content; |
|
37 |
store documentTextFiltered into '$output_document_text' using avro_store_document_text; |
modules/icm-iis-transformers/trunk/src/main/resources/eu/dnetlib/iis/transformers/importer/plaintext/skip_extracted/oozie_app/workflow.xml | ||
---|---|---|
1 |
<workflow-app xmlns="uri:oozie:workflow:0.4" name="transformers_importer_plaintext_skip_extracted"> |
|
2 |
|
|
3 |
<parameters> |
|
4 |
<property> |
|
5 |
<name>input_document_content</name> |
|
6 |
<description>document content input</description> |
|
7 |
</property> |
|
8 |
<property> |
|
9 |
<name>input_document_text</name> |
|
10 |
<description>document text input</description> |
|
11 |
</property> |
|
12 |
<property> |
|
13 |
<name>output_document_content</name> |
|
14 |
<description>document content output: all contents which were not processed so far, based on input_document_text inspection</description> |
|
15 |
</property> |
|
16 |
<property> |
|
17 |
<name>output_document_text</name> |
|
18 |
<description>document text output: all plaintext records which were already processed, found in input_document_text</description> |
|
19 |
</property> |
|
20 |
</parameters> |
|
21 |
|
|
22 |
<start to="transformer"/> |
|
23 |
<action name="transformer"> |
|
24 |
<pig> |
|
25 |
<job-tracker>${jobTracker}</job-tracker> |
|
26 |
<name-node>${nameNode}</name-node> |
|
27 |
<!-- The data generated by this node is deleted in this section --> |
|
28 |
<prepare> |
|
29 |
<delete path="${nameNode}${workingDir}/transformer" /> |
|
30 |
<delete path="${nameNode}${output_document_content}" /> |
|
31 |
<delete path="${nameNode}${output_document_text}" /> |
|
32 |
<mkdir path="${nameNode}${workingDir}/transformer" /> |
|
33 |
</prepare> |
|
34 |
<configuration> |
|
35 |
<property> |
|
36 |
<name>mapred.job.queue.name</name> |
|
37 |
<value>${queueName}</value> |
|
38 |
</property> |
|
39 |
<property> |
|
40 |
<name>mapred.map.child.java.opts</name> |
|
41 |
<value>-Xmx4g</value> |
|
42 |
</property> |
|
43 |
<property> |
|
44 |
<name>mapred.reduce.child.java.opts</name> |
|
45 |
<value>-Xmx4g</value> |
|
46 |
</property> |
|
47 |
</configuration> |
|
48 |
<!-- Path to PIG script the workflow executes. --> |
|
49 |
<script>lib/scripts/transformer.pig</script> |
|
50 |
<!-- The working directory of the workflow node. --> |
|
51 |
<param>workingDir=${workingDir}/transformer/working_dir</param> |
|
52 |
|
|
53 |
<param>schema_document_content=eu.dnetlib.iis.importer.auxiliary.schemas.DocumentContentUrl</param> |
|
54 |
<param>schema_document_text=eu.dnetlib.iis.metadataextraction.schemas.DocumentText</param> |
|
55 |
|
|
56 |
<param>input_document_content=${input_document_content}</param> |
|
57 |
<param>input_document_text=${input_document_text}</param> |
|
58 |
|
|
59 |
<param>output_document_content=${output_document_content}</param> |
|
60 |
<param>output_document_text=${output_document_text}</param> |
|
61 |
|
|
62 |
</pig> |
|
63 |
<ok to="end"/> |
|
64 |
<error to="fail"/> |
|
65 |
</action> |
|
66 |
<kill name="fail"> |
|
67 |
<message>Unfortunately, the workflow failed -- error message: |
|
68 |
[${wf:errorMessage(wf:lastErrorNode())}]</message> |
|
69 |
</kill> |
|
70 |
<end name="end"/> |
|
71 |
</workflow-app> |
|
0 | 72 |
modules/icm-iis-transformers/trunk/src/main/resources/eu/dnetlib/iis/transformers/importer/plaintext/skip_extracted/job.properties | ||
---|---|---|
1 |
input_document_content=/share/import/doc_content_url/europePMC/2014-06-19 |
|
2 |
input_document_text=/cache/plaintext/europePMC |
|
3 |
output_document_content=${workingDir}/tobeprocessed_content |
|
4 |
output_document_text=${workingDir}/tobereturned_plaintext |
|
0 | 5 |
Also available in: Unified diff
introducing importer/plaintext/skip_extracted transformer required for plaintext import caching