Project

General

Profile

« Previous | Next » 

Revision 29482

introducing importer/plaintext/skip_extracted transformer required for plaintext import caching

View differences:

modules/icm-iis-transformers/trunk/src/main/resources/eu/dnetlib/iis/transformers/importer/plaintext/skip_extracted/oozie_app/lib/scripts/transformer.pig
1
define avro_load_document_content
2
org.apache.pig.piggybank.storage.avro.AvroStorage(
3
'input_schema_class', '$schema_document_content');
4

  
5
define avro_load_document_text
6
org.apache.pig.piggybank.storage.avro.AvroStorage(
7
'input_schema_class', '$schema_document_text');
8

  
9
define avro_store_document_content
10
org.apache.pig.piggybank.storage.avro.AvroStorage(
11
'index', '0',
12
'output_schema_class', '$schema_document_content');
13

  
14
define avro_store_document_text
15
org.apache.pig.piggybank.storage.avro.AvroStorage(
16
'index', '1',
17
'output_schema_class', '$schema_document_text');
18

  
19
documentContent = load '$input_document_content' using avro_load_document_content;
20
documentText = load '$input_document_text' using avro_load_document_text;
21

  
22
documentTextId = foreach documentText generate id;
23

  
24
cachedDocumentIdDistinct = distinct documentTextId;
25

  
26
joinedDocumentContent = join documentContent by id left, cachedDocumentIdDistinct by id;
27
joinedFilteredDocumentContent = filter joinedDocumentContent by cachedDocumentIdDistinct::id is null;
28
documentContentFiltered = foreach joinedFilteredDocumentContent generate documentContent::id as id, documentContent::url as url, documentContent::mimeType as mimeType, documentContent::contentChecksum as contentChecksum;
29

  
30
documentContentId = foreach documentContent generate id;
31
documentContentIdDistinct = distinct documentContentId;
32

  
33
joinedDocumentText = join documentText by id, documentContentIdDistinct by id;
34
documentTextFiltered = foreach joinedDocumentText generate documentText::id as id, documentText::text as text;
35

  
36
store documentContentFiltered into '$output_document_content' using avro_store_document_content;
37
store documentTextFiltered into '$output_document_text' using avro_store_document_text;
modules/icm-iis-transformers/trunk/src/main/resources/eu/dnetlib/iis/transformers/importer/plaintext/skip_extracted/oozie_app/workflow.xml
1
<workflow-app xmlns="uri:oozie:workflow:0.4" name="transformers_importer_plaintext_skip_extracted">
2
	
3
	<parameters>
4
		<property>
5
			<name>input_document_content</name>
6
			<description>document content input</description>
7
		</property>
8
		<property>
9
			<name>input_document_text</name>
10
			<description>document text input</description>
11
		</property>
12
		<property>
13
			<name>output_document_content</name>
14
			<description>document content output: all contents which were not processed so far, based on input_document_text inspection</description>
15
		</property>
16
		<property>
17
			<name>output_document_text</name>
18
			<description>document text output: all plaintext records which were already processed, found in input_document_text</description>
19
		</property>
20
	</parameters>
21
    
22
    <start to="transformer"/>
23
    <action name="transformer">
24
        <pig>
25
            <job-tracker>${jobTracker}</job-tracker>
26
            <name-node>${nameNode}</name-node>
27
			<!-- The data generated by this node is deleted in this section -->
28
			<prepare>
29
				<delete path="${nameNode}${workingDir}/transformer" />
30
				<delete path="${nameNode}${output_document_content}" />
31
				<delete path="${nameNode}${output_document_text}" />
32
				<mkdir path="${nameNode}${workingDir}/transformer" />
33
			</prepare>
34
            <configuration>
35
                <property>
36
                    <name>mapred.job.queue.name</name>
37
                    <value>${queueName}</value>
38
                </property>
39
                <property>
40
                    <name>mapred.map.child.java.opts</name>
41
                    <value>-Xmx4g</value>
42
                </property>
43
                <property>
44
                    <name>mapred.reduce.child.java.opts</name>
45
                    <value>-Xmx4g</value>
46
                </property>
47
            </configuration>
48
            <!-- Path to PIG script the workflow executes. -->
49
            <script>lib/scripts/transformer.pig</script>
50
            <!-- The working directory of the workflow node. -->
51
            <param>workingDir=${workingDir}/transformer/working_dir</param>
52
            
53
            <param>schema_document_content=eu.dnetlib.iis.importer.auxiliary.schemas.DocumentContentUrl</param>
54
            <param>schema_document_text=eu.dnetlib.iis.metadataextraction.schemas.DocumentText</param>
55
            
56
            <param>input_document_content=${input_document_content}</param>
57
            <param>input_document_text=${input_document_text}</param>
58
            
59
            <param>output_document_content=${output_document_content}</param>
60
            <param>output_document_text=${output_document_text}</param>
61
            
62
        </pig>
63
        <ok to="end"/>
64
        <error to="fail"/>
65
    </action>
66
    <kill name="fail">
67
		<message>Unfortunately, the workflow failed -- error message:
68
			[${wf:errorMessage(wf:lastErrorNode())}]</message>
69
    </kill>
70
    <end name="end"/>
71
</workflow-app>
0 72

  
modules/icm-iis-transformers/trunk/src/main/resources/eu/dnetlib/iis/transformers/importer/plaintext/skip_extracted/job.properties
1
input_document_content=/share/import/doc_content_url/europePMC/2014-06-19
2
input_document_text=/cache/plaintext/europePMC
3
output_document_content=${workingDir}/tobeprocessed_content
4
output_document_text=${workingDir}/tobereturned_plaintext
0 5

  

Also available in: Unified diff