Revision 42411
Added by Claudio Atzori over 8 years ago
modules/dnet-openaireplus-profiles/tags/dnet-openaireplus-profiles-1.0.11/src/main/resources/eu/dnetlib/test/profiles/HadoopJobConfigurationDSResources/HadoopJobConfigurationDSResourceType/coauthorUpdateJob.xml | ||
---|---|---|
1 |
<RESOURCE_PROFILE> |
|
2 |
<HEADER> |
|
3 |
<RESOURCE_IDENTIFIER value="6d91b311-a7fd-48ff-98d2-1fed70850e3a_SGFkb29wSm9iQ29uZmlndXJhdGlvbkRTUmVzb3VyY2VzL0hhZG9vcEpvYkNvbmZpZ3VyYXRpb25EU1Jlc291cmNlVHlwZQ=="/> |
|
4 |
<RESOURCE_TYPE value="HadoopJobConfigurationDSResourceType"/> |
|
5 |
<RESOURCE_KIND value="HadoopJobConfigurationDSResources"/> |
|
6 |
<RESOURCE_URI value=""/> |
|
7 |
<DATE_OF_CREATION value="2001-12-31T12:00:00"/> |
|
8 |
</HEADER> |
|
9 |
<BODY> |
|
10 |
<HADOOP_JOB name="coauthorUpdateJob" type="mapreduce"> |
|
11 |
<DESCRIPTION>update coauthors using a map {merged author id --> anchorId}</DESCRIPTION> |
|
12 |
<STATIC_CONFIGURATION> |
|
13 |
|
|
14 |
<!-- I/O FORMAT --> |
|
15 |
<PROPERTY key="mapreduce.inputformat.class" value="org.apache.hadoop.hbase.mapreduce.TableInputFormat" /> |
|
16 |
|
|
17 |
<PROPERTY key="mapreduce.outputformat.class" value="org.apache.hadoop.hbase.mapreduce.TableOutputFormat" /> |
|
18 |
|
|
19 |
<!-- MAPPER --> |
|
20 |
<PROPERTY key="mapreduce.map.class" value="eu.dnetlib.data.mapreduce.hbase.dedup.gt.CoAuthorUpdateMapper" /> |
|
21 |
<PROPERTY key="mapred.mapoutput.key.class" value="org.apache.hadoop.hbase.io.ImmutableBytesWritable" /> |
|
22 |
<PROPERTY key="mapred.mapoutput.value.class" value="org.apache.hadoop.hbase.client.Put" /> |
|
23 |
|
|
24 |
|
|
25 |
<!-- MISC --> |
|
26 |
<PROPERTY key="mapred.output.compress" value="false" /> |
|
27 |
<PROPERTY key="mapred.compress.map.output" value="true" /> |
|
28 |
<PROPERTY key="mapred.reduce.tasks.speculative.execution" value="false" /> |
|
29 |
<PROPERTY key="mapred.map.tasks.speculative.execution" value="false" /> |
|
30 |
<PROPERTY key="mapreduce.map.speculative" value="false" /> |
|
31 |
<PROPERTY key="mapreduce.reduce.speculative" value="false" /> |
|
32 |
|
|
33 |
<PROPERTY key="mapred.reduce.tasks" value="0" /> |
|
34 |
|
|
35 |
<!-- <PROPERTY key="user.name" value="dnet" /> --> |
|
36 |
|
|
37 |
<!-- Uncomment to override the default lib path --> |
|
38 |
<!-- <PROPERTY key="job.lib" value="/user/dnet/dnet-mapreduce-jobs-0.0.2-SNAPSHOT-jar-with-dependencies.jar"/> --> |
|
39 |
</STATIC_CONFIGURATION> |
|
40 |
<JOB_INTERFACE> |
|
41 |
<PARAM name="hbase.mapred.inputtable" required="true" description="source hbase table" /> |
|
42 |
<PARAM name="hbase.mapreduce.inputtable" required="true" description="source hbase table" /> |
|
43 |
<PARAM name="mapred.output.dir" required="true" description="target sequence file on hdfs" /> |
|
44 |
</JOB_INTERFACE> |
|
45 |
<SCAN> |
|
46 |
<FILTERS operator="MUST_PASS_ALL"> |
|
47 |
<FILTER type="prefix" value="30"/> |
|
48 |
</FILTERS> |
|
49 |
<FAMILIES> |
|
50 |
<FAMILY value="person"/> |
|
51 |
</FAMILIES> |
|
52 |
</SCAN> |
|
53 |
</HADOOP_JOB> |
|
54 |
<STATUS> |
|
55 |
<LAST_SUBMISSION_DATE value="2001-12-31T12:00:00"/> |
|
56 |
<RUNNING_INSTANCES value="0"/> |
|
57 |
<CUMULATIVE_RUN value="0" /> |
|
58 |
</STATUS> |
|
59 |
<SECURITY_PARAMETERS>SECURITY_PARAMETERS</SECURITY_PARAMETERS> |
|
60 |
</BODY> |
|
61 |
</RESOURCE_PROFILE> |
modules/dnet-openaireplus-profiles/tags/dnet-openaireplus-profiles-1.0.11/src/main/resources/eu/dnetlib/test/profiles/HadoopJobConfigurationDSResources/HadoopJobConfigurationDSResourceType/dedupDeleteSimRelsJob.xml | ||
---|---|---|
1 |
<RESOURCE_PROFILE> |
|
2 |
<HEADER> |
|
3 |
<RESOURCE_IDENTIFIER value="6363b833-ac88-421c-8596-440a3dc735db_SGFkb29wSm9iQ29uZmlndXJhdGlvbkRTUmVzb3VyY2VzL0hhZG9vcEpvYkNvbmZpZ3VyYXRpb25EU1Jlc291cmNlVHlwZQ=="/> |
|
4 |
<RESOURCE_TYPE value="HadoopJobConfigurationDSResourceType"/> |
|
5 |
<RESOURCE_KIND value="HadoopJobConfigurationDSResources"/> |
|
6 |
<RESOURCE_URI value=""/> |
|
7 |
<DATE_OF_CREATION value="2001-12-31T12:00:00"/> |
|
8 |
</HEADER> |
|
9 |
<BODY> |
|
10 |
<HADOOP_JOB name="deleteSimRelJob" type="mapreduce"> |
|
11 |
<DESCRIPTION>map reduce job that deletes the similarity rels used in the deduplication process (person)</DESCRIPTION> |
|
12 |
<STATIC_CONFIGURATION> |
|
13 |
|
|
14 |
<!-- I/O FORMAT --> |
|
15 |
<PROPERTY key="mapreduce.inputformat.class" value="org.apache.hadoop.hbase.mapreduce.TableInputFormat" /> |
|
16 |
<PROPERTY key="mapreduce.outputformat.class" value="org.apache.hadoop.hbase.mapreduce.TableOutputFormat" /> |
|
17 |
|
|
18 |
<!-- MAPPER --> |
|
19 |
<PROPERTY key="mapreduce.map.class" value="eu.dnetlib.data.mapreduce.hbase.dedup.DedupDeleteSimRelMapper" /> |
|
20 |
<PROPERTY key="mapred.mapoutput.key.class" value="org.apache.hadoop.hbase.io.ImmutableBytesWritable" /> |
|
21 |
<!-- map output value type: the generic Hadoop Writable interface (package org.apache.hadoop.io) --> <PROPERTY key="mapred.mapoutput.value.class" value="org.apache.hadoop.io.Writable" /> |
|
22 |
|
|
23 |
<!-- MISC --> |
|
24 |
<PROPERTY key="mapred.reduce.tasks.speculative.execution" value="false" /> |
|
25 |
<PROPERTY key="mapred.map.tasks.speculative.execution" value="false" /> |
|
26 |
<PROPERTY key="mapreduce.map.speculative" value="false" /> |
|
27 |
<PROPERTY key="mapreduce.reduce.speculative" value="false" /> |
|
28 |
|
|
29 |
<PROPERTY key="mapred.reduce.tasks" value="0" /> |
|
30 |
<!-- <PROPERTY key="user.name" value="dnet" /> --> |
|
31 |
|
|
32 |
<!-- Uncomment to override the default lib path --> |
|
33 |
<!-- <PROPERTY key="job.lib" value="/user/dnet/dnet-mapreduce-jobs-0.0.2-SNAPSHOT-jar-with-dependencies.jar"/> --> |
|
34 |
</STATIC_CONFIGURATION> |
|
35 |
<JOB_INTERFACE> |
|
36 |
<PARAM name="hbase.mapred.inputtable" required="true" description="source hbase table" /> |
|
37 |
<PARAM name="hbase.mapred.outputtable" required="true" description="target hbase table" /> |
|
38 |
<PARAM name="hbase.mapreduce.inputtable" required="true" description="source hbase table" /> |
|
39 |
<PARAM name="hbase.mapreduce.outputtable" required="true" description="target hbase table" /> |
|
40 |
</JOB_INTERFACE> |
|
41 |
<SCAN> |
|
42 |
<FILTERS operator="MUST_PASS_ALL"> |
|
43 |
<FILTER type="prefix" param="entityTypeId" /> |
|
44 |
</FILTERS> |
|
45 |
<FAMILIES> |
|
46 |
<FAMILY param="entityType" /> |
|
47 |
<FAMILY value="resultResult_dedupSimilarity_isSimilarTo" /> |
|
48 |
<FAMILY value="personPerson_dedupSimilarity_isSimilarTo" /> |
|
49 |
<FAMILY value="organizationOrganization_dedupSimilarity_isSimilarTo" /> |
|
50 |
</FAMILIES> |
|
51 |
</SCAN> |
|
52 |
</HADOOP_JOB> |
|
53 |
<STATUS> |
|
54 |
<LAST_SUBMISSION_DATE value="2001-12-31T12:00:00"/> |
|
55 |
<RUNNING_INSTANCES value="0"/> |
|
56 |
<CUMULATIVE_RUN value="0" /> |
|
57 |
</STATUS> |
|
58 |
<SECURITY_PARAMETERS>SECURITY_PARAMETERS</SECURITY_PARAMETERS> |
|
59 |
</BODY> |
|
60 |
</RESOURCE_PROFILE> |
modules/dnet-openaireplus-profiles/tags/dnet-openaireplus-profiles-1.0.11/src/main/resources/eu/dnetlib/test/profiles/HadoopJobConfigurationDSResources/HadoopJobConfigurationDSResourceType/offlineHbaseLoadJob.xml | ||
---|---|---|
1 |
<RESOURCE_PROFILE> |
|
2 |
<HEADER> |
|
3 |
<RESOURCE_IDENTIFIER value="073e55eb-c6f4-49a9-80b3-1a927612ba5b_SGFkb29wSm9iQ29uZmlndXJhdGlvbkRTUmVzb3VyY2VzL0hhZG9vcEpvYkNvbmZpZ3VyYXRpb25EU1Jlc291cmNlVHlwZQ=="/> |
|
4 |
<RESOURCE_TYPE value="HadoopJobConfigurationDSResourceType"/> |
|
5 |
<RESOURCE_KIND value="HadoopJobConfigurationDSResources"/> |
|
6 |
<RESOURCE_URI value=""/> |
|
7 |
<DATE_OF_CREATION value="2001-12-31T12:00:00"/> |
|
8 |
</HEADER> |
|
9 |
<BODY> |
|
10 |
<HADOOP_JOB name="offlineHbaseLoad" type="mapreduce"> |
|
11 |
<DESCRIPTION>map reduce job that loads a given entity type in the offline dedup table</DESCRIPTION> |
|
12 |
<STATIC_CONFIGURATION> |
|
13 |
|
|
14 |
<!-- I/O FORMAT --> |
|
15 |
<PROPERTY key="mapreduce.inputformat.class" value="org.apache.hadoop.hbase.mapreduce.TableInputFormat" /> |
|
16 |
<PROPERTY key="mapreduce.outputformat.class" value="org.apache.hadoop.hbase.mapreduce.TableOutputFormat" /> |
|
17 |
|
|
18 |
<!-- MAPPER --> |
|
19 |
<PROPERTY key="mapreduce.map.class" value="eu.dnetlib.data.mapreduce.hbase.dedup.OfflineHbaseLoadMapper" /> |
|
20 |
<PROPERTY key="mapred.mapoutput.key.class" value="org.apache.hadoop.io.Text" /> |
|
21 |
<PROPERTY key="mapred.mapoutput.value.class" value="org.apache.hadoop.hbase.io.ImmutableBytesWritable" /> |
|
22 |
|
|
23 |
<!-- MISC --> |
|
24 |
<PROPERTY key="mapred.compress.map.output" value="true" /> |
|
25 |
<PROPERTY key="mapred.reduce.tasks.speculative.execution" value="false" /> |
|
26 |
<PROPERTY key="mapred.map.tasks.speculative.execution" value="false" /> |
|
27 |
<PROPERTY key="mapreduce.map.speculative" value="false" /> |
|
28 |
<PROPERTY key="mapreduce.reduce.speculative" value="false" /> |
|
29 |
|
|
30 |
<PROPERTY key="mapred.reduce.tasks" value="0" /> |
|
31 |
<!-- <PROPERTY key="user.name" value="dnet" /> --> |
|
32 |
|
|
33 |
<!-- Uncomment to override the default lib path --> |
|
34 |
<!-- <PROPERTY key="job.lib" value="/user/dnet/dnet-mapreduce-jobs-0.0.2-SNAPSHOT-jar-with-dependencies.jar"/> --> |
|
35 |
</STATIC_CONFIGURATION> |
|
36 |
<JOB_INTERFACE> |
|
37 |
<PARAM name="hbase.mapred.inputtable" required="true" description="source hbase table" /> |
|
38 |
<PARAM name="hbase.mapred.outputtable" required="true" description="target hbase table" /> |
|
39 |
<PARAM name="hbase.mapreduce.inputtable" required="true" description="source hbase table" /> |
|
40 |
<PARAM name="hbase.mapreduce.outputtable" required="true" description="target hbase table" /> |
|
41 |
</JOB_INTERFACE> |
|
42 |
<SCAN> |
|
43 |
<FILTERS operator="MUST_PASS_ALL"> |
|
44 |
<FILTER type="prefix" param="entityTypeId" /> |
|
45 |
</FILTERS> |
|
46 |
<FAMILIES> |
|
47 |
<FAMILY param="entityType" /> |
|
48 |
</FAMILIES> |
|
49 |
</SCAN> |
|
50 |
</HADOOP_JOB> |
|
51 |
<STATUS> |
|
52 |
<LAST_SUBMISSION_DATE value="2001-12-31T12:00:00"/> |
|
53 |
<RUNNING_INSTANCES value="0"/> |
|
54 |
<CUMULATIVE_RUN value="0" /> |
|
55 |
</STATUS> |
|
56 |
<SECURITY_PARAMETERS>SECURITY_PARAMETERS</SECURITY_PARAMETERS> |
|
57 |
</BODY> |
|
58 |
</RESOURCE_PROFILE> |
modules/dnet-openaireplus-profiles/tags/dnet-openaireplus-profiles-1.0.11/src/main/resources/eu/dnetlib/test/profiles/HadoopJobConfigurationDSResources/HadoopJobConfigurationDSResourceType/dedupPersonJob.xml | ||
---|---|---|
1 |
<RESOURCE_PROFILE> |
|
2 |
<HEADER> |
|
3 |
<RESOURCE_IDENTIFIER value="29638605-235b-4cc1-9bf5-a5dd2fc84915_SGFkb29wSm9iQ29uZmlndXJhdGlvbkRTUmVzb3VyY2VzL0hhZG9vcEpvYkNvbmZpZ3VyYXRpb25EU1Jlc291cmNlVHlwZQ=="/> |
|
4 |
<RESOURCE_TYPE value="HadoopJobConfigurationDSResourceType"/> |
|
5 |
<RESOURCE_KIND value="HadoopJobConfigurationDSResources"/> |
|
6 |
<RESOURCE_URI value=""/> |
|
7 |
<DATE_OF_CREATION value="2001-12-31T12:00:00"/> |
|
8 |
</HEADER> |
|
9 |
<BODY> |
|
10 |
<HADOOP_JOB name="dedupPersonJob" type="mapreduce"> |
|
11 |
<DESCRIPTION>map reduce job that scans a given entity type and creates the similarRel graph</DESCRIPTION> |
|
12 |
<STATIC_CONFIGURATION> |
|
13 |
|
|
14 |
<!-- I/O FORMAT --> |
|
15 |
<PROPERTY key="mapreduce.inputformat.class" value="org.apache.hadoop.hbase.mapreduce.TableInputFormat" /> |
|
16 |
<PROPERTY key="mapreduce.outputformat.class" value="org.apache.hadoop.hbase.mapreduce.TableOutputFormat" /> |
|
17 |
|
|
18 |
<!-- MAPPER --> |
|
19 |
<PROPERTY key="mapreduce.map.class" value="eu.dnetlib.data.mapreduce.hbase.dedup.SimpleDedupPersonMapper" /> |
|
20 |
<PROPERTY key="mapred.mapoutput.key.class" value="org.apache.hadoop.io.Text" /> |
|
21 |
<PROPERTY key="mapred.mapoutput.value.class" value="org.apache.hadoop.hbase.io.ImmutableBytesWritable" /> |
|
22 |
|
|
23 |
<!-- REDUCER --> |
|
24 |
<PROPERTY key="mapreduce.reduce.class" value="eu.dnetlib.data.mapreduce.hbase.dedup.SimpleDedupPersonReducer" /> |
|
25 |
<PROPERTY key="mapred.output.key.class" value="org.apache.hadoop.hbase.io.ImmutableBytesWritable" /> |
|
26 |
<PROPERTY key="mapred.output.value.class" value="org.apache.hadoop.io.Writable" /> |
|
27 |
|
|
28 |
<!-- MISC --> |
|
29 |
<PROPERTY key="mapred.reduce.tasks.speculative.execution" value="false" /> |
|
30 |
<PROPERTY key="mapred.map.tasks.speculative.execution" value="false" /> |
|
31 |
<PROPERTY key="mapreduce.map.speculative" value="false" /> |
|
32 |
<PROPERTY key="mapreduce.reduce.speculative" value="false" /> |
|
33 |
|
|
34 |
<PROPERTY key="mapred.reduce.tasks" value="1000" /> |
|
35 |
<!-- <PROPERTY key="user.name" value="dnet" /> --> |
|
36 |
|
|
37 |
<!-- Uncomment to override the default lib path --> |
|
38 |
<!-- <PROPERTY key="job.lib" value="/user/dnet/dnet-mapreduce-jobs-0.0.2-SNAPSHOT-jar-with-dependencies.jar"/> --> |
|
39 |
</STATIC_CONFIGURATION> |
|
40 |
<JOB_INTERFACE> |
|
41 |
<PARAM name="hbase.mapred.inputtable" required="true" description="source hbase table" /> |
|
42 |
<PARAM name="hbase.mapred.outputtable" required="true" description="target hbase table" /> |
|
43 |
<PARAM name="hbase.mapreduce.inputtable" required="true" description="source hbase table" /> |
|
44 |
<PARAM name="hbase.mapreduce.outputtable" required="true" description="target hbase table" /> |
|
45 |
</JOB_INTERFACE> |
|
46 |
<SCAN> |
|
47 |
<FILTERS operator="MUST_PASS_ALL"> |
|
48 |
<FILTER type="prefix" param="entityTypeId" /> |
|
49 |
</FILTERS> |
|
50 |
<FAMILIES> |
|
51 |
<FAMILY param="entityType" /> |
|
52 |
</FAMILIES> |
|
53 |
</SCAN> |
|
54 |
</HADOOP_JOB> |
|
55 |
<STATUS> |
|
56 |
<LAST_SUBMISSION_DATE value="2001-12-31T12:00:00"/> |
|
57 |
<RUNNING_INSTANCES value="0"/> |
|
58 |
<CUMULATIVE_RUN value="0" /> |
|
59 |
</STATUS> |
|
60 |
<SECURITY_PARAMETERS>SECURITY_PARAMETERS</SECURITY_PARAMETERS> |
|
61 |
</BODY> |
|
62 |
</RESOURCE_PROFILE> |
modules/dnet-openaireplus-profiles/tags/dnet-openaireplus-profiles-1.0.11/src/main/resources/eu/dnetlib/test/profiles/HadoopJobConfigurationDSResources/HadoopJobConfigurationDSResourceType/dedupSimilarity2ActionsJob.xml | ||
---|---|---|
1 |
<RESOURCE_PROFILE> |
|
2 |
<HEADER> |
|
3 |
<RESOURCE_IDENTIFIER value="5c4b4dbf-8198-4f7a-9a35-367c7b0a7391_SGFkb29wSm9iQ29uZmlndXJhdGlvbkRTUmVzb3VyY2VzL0hhZG9vcEpvYkNvbmZpZ3VyYXRpb25EU1Jlc291cmNlVHlwZQ=="/> |
|
4 |
<RESOURCE_TYPE value="HadoopJobConfigurationDSResourceType"/> |
|
5 |
<RESOURCE_KIND value="HadoopJobConfigurationDSResources"/> |
|
6 |
<RESOURCE_URI value=""/> |
|
7 |
<DATE_OF_CREATION value="2001-12-31T12:00:00"/> |
|
8 |
</HEADER> |
|
9 |
<BODY> |
|
10 |
<HADOOP_JOB name="dedupSimilarity2ActionsJob" type="mapreduce"> |
|
11 |
<DESCRIPTION>map reduce job that scans a given entity type and converts its dedup similarity relations into actions</DESCRIPTION> |
|
12 |
<STATIC_CONFIGURATION> |
|
13 |
|
|
14 |
<!-- I/O FORMAT --> |
|
15 |
<PROPERTY key="mapreduce.inputformat.class" value="org.apache.hadoop.hbase.mapreduce.TableInputFormat" /> |
|
16 |
<PROPERTY key="mapreduce.outputformat.class" value="org.apache.hadoop.hbase.mapreduce.TableOutputFormat" /> |
|
17 |
|
|
18 |
<!-- MAPPER --> |
|
19 |
<PROPERTY key="mapreduce.map.class" value="eu.dnetlib.data.mapreduce.hbase.dedup.DedupSimilarityToActionsMapper" /> |
|
20 |
<PROPERTY key="mapred.mapoutput.key.class" value="org.apache.hadoop.io.Text" /> |
|
21 |
<PROPERTY key="mapred.mapoutput.value.class" value="org.apache.hadoop.hbase.io.ImmutableBytesWritable" /> |
|
22 |
|
|
23 |
<!-- MISC --> |
|
24 |
<PROPERTY key="mapred.compress.map.output" value="true" /> |
|
25 |
<PROPERTY key="mapred.reduce.tasks.speculative.execution" value="false" /> |
|
26 |
<PROPERTY key="mapred.map.tasks.speculative.execution" value="false" /> |
|
27 |
<PROPERTY key="mapreduce.map.speculative" value="false" /> |
|
28 |
<PROPERTY key="mapreduce.reduce.speculative" value="false" /> |
|
29 |
|
|
30 |
<PROPERTY key="mapred.reduce.tasks" value="0" /> |
|
31 |
<!-- <PROPERTY key="user.name" value="dnet" /> --> |
|
32 |
|
|
33 |
<!-- Uncomment to override the default lib path --> |
|
34 |
<!-- <PROPERTY key="job.lib" value="/user/dnet/dnet-mapreduce-jobs-0.0.2-SNAPSHOT-jar-with-dependencies.jar"/> --> |
|
35 |
</STATIC_CONFIGURATION> |
|
36 |
<JOB_INTERFACE> |
|
37 |
<PARAM name="hbase.mapred.inputtable" required="true" description="source hbase table" /> |
|
38 |
<PARAM name="hbase.mapred.outputtable" required="true" description="target hbase table" /> |
|
39 |
<PARAM name="hbase.mapreduce.inputtable" required="true" description="source hbase table" /> |
|
40 |
<PARAM name="hbase.mapreduce.outputtable" required="true" description="target hbase table" /> |
|
41 |
<PARAM name="dedup.conf" required="true" description="dedup configuration" /> |
|
42 |
<PARAM name="rawSetId" required="true" description="raw set identifier" /> |
|
43 |
<PARAM name="similarityCF" required="true" description="similarity column family name" /> |
|
44 |
</JOB_INTERFACE> |
|
45 |
<SCAN> |
|
46 |
<FILTERS operator="MUST_PASS_ALL"> |
|
47 |
<FILTER type="prefix" param="entityTypeId" /> |
|
48 |
</FILTERS> |
|
49 |
<FAMILIES> |
|
50 |
<FAMILY param="entityType" /> |
|
51 |
<FAMILY value="resultResult_dedupSimilarity_isSimilarTo" /> |
|
52 |
<FAMILY value="personPerson_dedupSimilarity_isSimilarTo" /> |
|
53 |
<FAMILY value="organizationOrganization_dedupSimilarity_isSimilarTo" /> |
|
54 |
</FAMILIES> |
|
55 |
</SCAN> |
|
56 |
</HADOOP_JOB> |
|
57 |
<STATUS> |
|
58 |
<LAST_SUBMISSION_DATE value="2001-12-31T12:00:00"/> |
|
59 |
<RUNNING_INSTANCES value="0"/> |
|
60 |
<CUMULATIVE_RUN value="0" /> |
|
61 |
</STATUS> |
|
62 |
<SECURITY_PARAMETERS>SECURITY_PARAMETERS</SECURITY_PARAMETERS> |
|
63 |
</BODY> |
|
64 |
</RESOURCE_PROFILE> |
modules/dnet-openaireplus-profiles/tags/dnet-openaireplus-profiles-1.0.11/src/main/resources/eu/dnetlib/test/profiles/HadoopJobConfigurationDSResources/HadoopJobConfigurationDSResourceType/indexFeedJob.xml | ||
---|---|---|
1 |
<RESOURCE_PROFILE> |
|
2 |
<HEADER> |
|
3 |
<RESOURCE_IDENTIFIER value="1c34963b-75b3-4440-9f42-72445a26c077_SGFkb29wSm9iQ29uZmlndXJhdGlvbkRTUmVzb3VyY2VzL0hhZG9vcEpvYkNvbmZpZ3VyYXRpb25EU1Jlc291cmNlVHlwZQ=="/> |
|
4 |
<RESOURCE_TYPE value="HadoopJobConfigurationDSResourceType"/> |
|
5 |
<RESOURCE_KIND value="HadoopJobConfigurationDSResources"/> |
|
6 |
<RESOURCE_URI value=""/> |
|
7 |
<DATE_OF_CREATION value="2001-12-31T12:00:00"/> |
|
8 |
</HEADER> |
|
9 |
<BODY> |
|
10 |
<HADOOP_JOB name="indexFeedJob" type="mapreduce"> |
|
11 |
<DESCRIPTION>map reduce job that joins the entities on the hbase table and produces a sequence file containing the xml records</DESCRIPTION> |
|
12 |
<STATIC_CONFIGURATION> |
|
13 |
|
|
14 |
<!-- I/O FORMAT --> |
|
15 |
<PROPERTY key="mapreduce.inputformat.class" value="org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat" /> |
|
16 |
<PROPERTY key="mapreduce.outputformat.class" value="org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat" /> |
|
17 |
|
|
18 |
<!-- MAPPER --> |
|
19 |
<PROPERTY key="mapreduce.map.class" value="eu.dnetlib.data.mapreduce.hbase.index.IndexFeedMapper" /> |
|
20 |
<PROPERTY key="mapred.mapoutput.key.class" value="org.apache.hadoop.io.Text" /> |
|
21 |
<PROPERTY key="mapred.mapoutput.value.class" value="org.apache.hadoop.io.Text" /> |
|
22 |
|
|
23 |
<!-- JOB GLOBAL --> |
|
24 |
<PROPERTY key="mapred.output.key.class" value="org.apache.hadoop.io.Text" /> |
|
25 |
<PROPERTY key="mapred.output.value.class" value="org.apache.hadoop.io.Text"/> |
|
26 |
|
|
27 |
<!-- MISC --> |
|
28 |
<PROPERTY key="mapred.task.timeout" value="1800000"/> |
|
29 |
<PROPERTY key="mapred.map.tasks.speculative.execution" value="false" /> |
|
30 |
<PROPERTY key="mapreduce.map.speculative" value="false" /> |
|
31 |
<PROPERTY key="mapred.reduce.tasks" value="0" /> |
|
32 |
<PROPERTY key="mapred.fairscheduler.pool" value="solr"/> |
|
33 |
<!-- <PROPERTY key="user.name" value="dnet" /> --> |
|
34 |
|
|
35 |
<!-- Uncomment to override the default lib path --> |
|
36 |
<!-- <PROPERTY key="job.lib" value="/user/dnet/dnet-mapreduce-jobs-0.0.2-SNAPSHOT-jar-with-dependencies.jar"/> --> |
|
37 |
</STATIC_CONFIGURATION> |
|
38 |
<JOB_INTERFACE> |
|
39 |
<PARAM name="mapred.input.dir" required="true" description="source sequence file on hdfs" /> |
|
40 |
<PARAM name="mapred.output.dir" required="true" description="destination path on hdfs for rotten index xml records" /> |
|
41 |
|
|
42 |
<PARAM name="index.solr.url" required="false" description="url used to instantiate the solr client" /> |
|
43 |
<PARAM name="index.solr.collection" required="true" description="target solr collection to be fed" /> |
|
44 |
|
|
45 |
<PARAM name="id" required="true" description="index DS id" /> |
|
46 |
<PARAM name="index.shutdown.wait.time" required="true" description="wait time before shut down the solr client pool" /> |
|
47 |
<PARAM name="index.buffer.flush.threshold" required="true" description="indexing buffer flush threshold" /> |
|
48 |
<PARAM name="index.feed.timestamp" required="true" description="timestamp used as ds_version" /> |
|
49 |
<PARAM name="index.solr.sim.mode" required="true" description="boolean value, allows to run this job in simulation mode" /> |
|
50 |
<PARAM name="index.xslt" required="true" description="record transformer created by the MSRO service" /> |
|
51 |
</JOB_INTERFACE> |
|
52 |
<SCAN> |
|
53 |
<FILTERS /> |
|
54 |
<FAMILIES /> |
|
55 |
</SCAN> |
|
56 |
</HADOOP_JOB> |
|
57 |
<STATUS> |
|
58 |
<LAST_SUBMISSION_DATE value="2001-12-31T12:00:00"/> |
|
59 |
<RUNNING_INSTANCES value="0"/> |
|
60 |
<CUMULATIVE_RUN value="0" /> |
|
61 |
</STATUS> |
|
62 |
<SECURITY_PARAMETERS>SECURITY_PARAMETERS</SECURITY_PARAMETERS> |
|
63 |
</BODY> |
|
64 |
</RESOURCE_PROFILE> |
modules/dnet-openaireplus-profiles/tags/dnet-openaireplus-profiles-1.0.11/src/main/resources/eu/dnetlib/test/profiles/HadoopJobConfigurationDSResources/HadoopJobConfigurationDSResourceType/dedupGrouperJob.xml | ||
---|---|---|
1 |
<RESOURCE_PROFILE> |
|
2 |
<HEADER> |
|
3 |
<RESOURCE_IDENTIFIER value="6b2d8db3-346f-4ddc-8591-39fd488c1191_SGFkb29wSm9iQ29uZmlndXJhdGlvbkRTUmVzb3VyY2VzL0hhZG9vcEpvYkNvbmZpZ3VyYXRpb25EU1Jlc291cmNlVHlwZQ=="/> |
|
4 |
<RESOURCE_TYPE value="HadoopJobConfigurationDSResourceType"/> |
|
5 |
<RESOURCE_KIND value="HadoopJobConfigurationDSResources"/> |
|
6 |
<RESOURCE_URI value=""/> |
|
7 |
<DATE_OF_CREATION value="2001-12-31T12:00:00"/> |
|
8 |
</HEADER> |
|
9 |
<BODY> |
|
10 |
<HADOOP_JOB name="dedupGrouperJob" type="mapreduce"> |
|
11 |
<DESCRIPTION>map only job that closes the similarity mesh</DESCRIPTION> |
|
12 |
<STATIC_CONFIGURATION> |
|
13 |
|
|
14 |
<!-- I/O FORMAT --> |
|
15 |
<PROPERTY key="mapreduce.inputformat.class" value="org.apache.hadoop.hbase.mapreduce.TableInputFormat" /> |
|
16 |
<PROPERTY key="mapreduce.outputformat.class" value="org.apache.hadoop.hbase.mapreduce.TableOutputFormat" /> |
|
17 |
|
|
18 |
<!-- MAPPER --> |
|
19 |
<PROPERTY key="mapreduce.map.class" value="eu.dnetlib.data.mapreduce.hbase.dedup.DedupGrouperMapper" /> |
|
20 |
<PROPERTY key="mapred.mapoutput.key.class" value="org.apache.hadoop.io.Text" /> |
|
21 |
<PROPERTY key="mapred.mapoutput.value.class" value="org.apache.hadoop.hbase.client.Put" /> |
|
22 |
|
|
23 |
<!-- MISC --> |
|
24 |
<PROPERTY key="mapred.reduce.tasks.speculative.execution" value="false" /> |
|
25 |
<PROPERTY key="mapred.map.tasks.speculative.execution" value="false" /> |
|
26 |
<PROPERTY key="mapreduce.map.speculative" value="false" /> |
|
27 |
<PROPERTY key="mapreduce.reduce.speculative" value="false" /> |
|
28 |
|
|
29 |
<PROPERTY key="mapred.reduce.tasks" value="0" /> |
|
30 |
<!-- <PROPERTY key="user.name" value="dnet" /> --> |
|
31 |
|
|
32 |
<!-- Uncomment to override the default lib path --> |
|
33 |
<!-- <PROPERTY key="job.lib" value="/user/dnet/dnet-mapreduce-jobs-0.0.2-SNAPSHOT-jar-with-dependencies.jar"/> --> |
|
34 |
</STATIC_CONFIGURATION> |
|
35 |
<JOB_INTERFACE> |
|
36 |
<PARAM name="hbase.mapred.inputtable" required="true" description="source hbase table" /> |
|
37 |
<PARAM name="hbase.mapred.outputtable" required="true" description="target hbase table" /> |
|
38 |
<PARAM name="hbase.mapreduce.inputtable" required="true" description="source hbase table" /> |
|
39 |
<PARAM name="hbase.mapreduce.outputtable" required="true" description="target hbase table" /> |
|
40 |
</JOB_INTERFACE> |
|
41 |
<SCAN> |
|
42 |
<FILTERS operator="MUST_PASS_ALL"> |
|
43 |
<FILTER type="prefix" param="entityTypeId" /> |
|
44 |
</FILTERS> |
|
45 |
<FAMILIES> |
|
46 |
<FAMILY value="resultResult_dedupSimilarity_isSimilarTo" /> |
|
47 |
<FAMILY value="personPerson_dedupSimilarity_isSimilarTo" /> |
|
48 |
<FAMILY value="organizationOrganization_dedupSimilarity_isSimilarTo" /> |
|
49 |
</FAMILIES> |
|
50 |
</SCAN> |
|
51 |
</HADOOP_JOB> |
|
52 |
<STATUS> |
|
53 |
<LAST_SUBMISSION_DATE value="2001-12-31T12:00:00"/> |
|
54 |
<RUNNING_INSTANCES value="0"/> |
|
55 |
<CUMULATIVE_RUN value="0" /> |
|
56 |
</STATUS> |
|
57 |
<SECURITY_PARAMETERS>SECURITY_PARAMETERS</SECURITY_PARAMETERS> |
|
58 |
</BODY> |
|
59 |
</RESOURCE_PROFILE> |
modules/dnet-openaireplus-profiles/tags/dnet-openaireplus-profiles-1.0.11/pom.xml | ||
---|---|---|
1 |
<?xml version="1.0" encoding="UTF-8"?> |
|
2 |
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd"> |
|
3 |
<parent> |
|
4 |
<groupId>eu.dnetlib</groupId> |
|
5 |
<artifactId>dnet-parent</artifactId> |
|
6 |
<version>1.0.0</version> |
|
7 |
<relativePath /> |
|
8 |
</parent> |
|
9 |
<modelVersion>4.0.0</modelVersion> |
|
10 |
<groupId>eu.dnetlib</groupId> |
|
11 |
<artifactId>dnet-openaireplus-profiles</artifactId> |
|
12 |
<packaging>jar</packaging> |
|
13 |
<version>1.0.11</version> |
|
14 |
<scm> |
|
15 |
<developerConnection>scm:svn:https://svn.driver.research-infrastructures.eu/driver/dnet40/modules/dnet-openaireplus-profiles/tags/dnet-openaireplus-profiles-1.0.11</developerConnection> |
|
16 |
</scm> |
|
17 |
|
|
18 |
<dependencies> |
|
19 |
|
|
20 |
</dependencies> |
|
21 |
</project> |
modules/dnet-openaireplus-profiles/tags/dnet-openaireplus-profiles-1.0.11/src/main/resources/eu/dnetlib/test/profiles/HadoopJobConfigurationDSResources/HadoopJobConfigurationDSResourceType/oaiFeedJob.xml | ||
---|---|---|
1 |
<RESOURCE_PROFILE> |
|
2 |
<HEADER> |
|
3 |
<RESOURCE_IDENTIFIER value="03d7af20-63bb-4790-a052-6cdbc1e05fce_SGFkb29wSm9iQ29uZmlndXJhdGlvbkRTUmVzb3VyY2VzL0hhZG9vcEpvYkNvbmZpZ3VyYXRpb25EU1Jlc291cmNlVHlwZQ=="/> |
|
4 |
<RESOURCE_TYPE value="HadoopJobConfigurationDSResourceType"/> |
|
5 |
<RESOURCE_KIND value="HadoopJobConfigurationDSResources"/> |
|
6 |
<RESOURCE_URI value=""/> |
|
7 |
<DATE_OF_CREATION value="2015-02-09T12:00:00"/> |
|
8 |
</HEADER> |
|
9 |
<BODY> |
|
10 |
<HADOOP_JOB name="oaiFeedJob" type="mapreduce"> |
|
11 |
<DESCRIPTION>map reduce job that feeds the OAI store</DESCRIPTION> |
|
12 |
<STATIC_CONFIGURATION> |
|
13 |
|
|
14 |
<!-- I/O FORMAT --> |
|
15 |
<PROPERTY key="mapreduce.inputformat.class" value="org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat" /> |
|
16 |
<PROPERTY key="mapreduce.outputformat.class" value="org.apache.hadoop.mapreduce.lib.output.NullOutputFormat" /> |
|
17 |
|
|
18 |
<!-- MAPPER --> |
|
19 |
<PROPERTY key="mapreduce.map.class" value="eu.dnetlib.data.mapreduce.hbase.oai.OaiFeedMapper" /> |
|
20 |
<PROPERTY key="mapred.mapoutput.key.class" value="org.apache.hadoop.io.NullWritable" /> |
|
21 |
<PROPERTY key="mapred.mapoutput.value.class" value="org.apache.hadoop.io.NullWritable" /> |
|
22 |
|
|
23 |
<!-- JOB GLOBAL --> |
|
24 |
<!-- <PROPERTY key="mapred.output.key.class" value="org.apache.hadoop.io.NullWritable" /> --> |
|
25 |
<!-- <PROPERTY key="mapred.output.value.class" value="org.apache.hadoop.io.NullWritable"/> --> |
|
26 |
|
|
27 |
<!-- MISC --> |
|
28 |
<PROPERTY key="mapred.map.tasks.speculative.execution" value="false" /> |
|
29 |
<PROPERTY key="mapreduce.map.speculative" value="false" /> |
|
30 |
<PROPERTY key="mapred.reduce.tasks" value="0" /> |
|
31 |
<!-- <PROPERTY key="user.name" value="dnet" /> --> |
|
32 |
|
|
33 |
<!-- Overrides the default lib path; comment out the job.lib property to fall back to the default --> |
|
34 |
<PROPERTY key="job.lib" value="/user/dnet/lib/dnet-mapreduce-jobs-assembly-0.0.6.3-SNAPSHOT.jar"/> |
|
35 |
</STATIC_CONFIGURATION> |
|
36 |
<JOB_INTERFACE> |
|
37 |
<PARAM name="mapred.input.dir" required="true" description="source sequence file on hdfs" /> |
|
38 |
<PARAM name="services.publisher.oai.collection" required="true" description="target mongodb collection" /> |
|
39 |
<PARAM name="oaiConfiguration" required="true" description="configuration bean used to guide the OAI feeding" /> |
|
40 |
<PARAM name="oai.feed.date" required="true" description="timestamp" /> |
|
41 |
<PARAM name="services.publisher.oai.host" required="true" description="mongodb host" /> |
|
42 |
<PARAM name="services.publisher.oai.port" required="true" description="mongodb port" /> |
|
43 |
<PARAM name="services.publisher.oai.db" required="true" description="mongodb database name" /> |
|
44 |
<PARAM name="services.publisher.oai.skipDuplicates" required="true" description="skip duplicated records." /> |
|
45 |
<PARAM name="services.publisher.oai.duplicateXPath" required="true" description="records with this xpath are identified as duplicates" /> |
|
46 |
</JOB_INTERFACE> |
|
47 |
<SCAN> |
|
48 |
<FILTERS /> |
|
49 |
<FAMILIES /> |
|
50 |
</SCAN> |
|
51 |
</HADOOP_JOB> |
|
52 |
<STATUS> |
|
53 |
<LAST_SUBMISSION_DATE value="2001-12-31T12:00:00"/> |
|
54 |
<RUNNING_INSTANCES value="0"/> |
|
55 |
<CUMULATIVE_RUN value="0" /> |
|
56 |
</STATUS> |
|
57 |
<SECURITY_PARAMETERS>SECURITY_PARAMETERS</SECURITY_PARAMETERS> |
|
58 |
</BODY> |
|
59 |
</RESOURCE_PROFILE> |
modules/dnet-openaireplus-profiles/tags/dnet-openaireplus-profiles-1.0.11/src/main/resources/eu/dnetlib/test/profiles/HadoopJobConfigurationDSResources/HadoopJobConfigurationDSResourceType/dedupExportPersonFullnameJob.xml | ||
---|---|---|
1 |
<RESOURCE_PROFILE> |
|
2 |
<HEADER> |
|
3 |
<RESOURCE_IDENTIFIER value="ba309300-76f2-40d1-afe3-b77016f443e9_SGFkb29wSm9iQ29uZmlndXJhdGlvbkRTUmVzb3VyY2VzL0hhZG9vcEpvYkNvbmZpZ3VyYXRpb25EU1Jlc291cmNlVHlwZQ=="/> |
|
4 |
<RESOURCE_TYPE value="HadoopJobConfigurationDSResourceType"/> |
|
5 |
<RESOURCE_KIND value="HadoopJobConfigurationDSResources"/> |
|
6 |
<RESOURCE_URI value=""/> |
|
7 |
<DATE_OF_CREATION value="2001-12-31T12:00:00"/> |
|
8 |
</HEADER> |
|
9 |
<BODY> |
|
10 |
<HADOOP_JOB name="exportPersonFullnamesJob" type="mapreduce"> |
|
11 |
<DESCRIPTION>map reduce job that exports the person fullnames on a text file on HDFS</DESCRIPTION> |
|
12 |
<STATIC_CONFIGURATION> |
|
13 |
|
|
14 |
<!-- I/O FORMAT --> |
|
15 |
<PROPERTY key="mapreduce.inputformat.class" value="org.apache.hadoop.hbase.mapreduce.TableInputFormat" /> |
|
16 |
<PROPERTY key="mapreduce.outputformat.class" value="org.apache.hadoop.mapreduce.lib.output.TextOutputFormat" /> |
|
17 |
|
|
18 |
<!-- MAPPER --> |
|
19 |
<PROPERTY key="mapreduce.map.class" value="eu.dnetlib.data.mapreduce.hbase.dedup.preprocess.ExportFullnameMapper" /> |
|
20 |
<PROPERTY key="mapred.mapoutput.key.class" value="org.apache.hadoop.hbase.io.ImmutableBytesWritable" /> |
|
21 |
<PROPERTY key="mapred.mapoutput.value.class" value="org.apache.hadoop.io.Text" /> |
|
22 |
|
|
23 |
<!-- REDUCER --> |
|
24 |
<PROPERTY key="mapreduce.reduce.class" value="eu.dnetlib.data.mapreduce.hbase.dedup.preprocess.ExportFullnameReducer" /> |
|
25 |
<PROPERTY key="mapred.output.key.class" value="org.apache.hadoop.io.Text" /> |
|
26 |
<PROPERTY key="mapred.output.value.class" value="org.apache.hadoop.io.Text" /> |
|
27 |
|
|
28 |
<!-- MISC --> |
|
29 |
<PROPERTY key="mapred.reduce.tasks.speculative.execution" value="false" /> |
|
30 |
<PROPERTY key="mapred.map.tasks.speculative.execution" value="false" /> |
|
31 |
<PROPERTY key="mapreduce.map.speculative" value="false" /> |
|
32 |
<PROPERTY key="mapreduce.reduce.speculative" value="false" /> |
|
33 |
|
|
34 |
<PROPERTY key="mapred.reduce.tasks" value="1" /> |
|
35 |
<PROPERTY key="dfs.blocksize" value="256M" /> |
|
36 |
<PROPERTY key="mapred.textoutputformat.separator" value="" /> |
|
37 |
<!-- <PROPERTY key="user.name" value="dnet" /> --> |
|
38 |
|
|
39 |
<!-- Uncomment to override the default lib path --> |
|
40 |
<!-- <PROPERTY key="job.lib" value="/user/dnet/dnet-mapreduce-jobs-0.0.2-SNAPSHOT-jar-with-dependencies.jar"/> --> |
|
41 |
</STATIC_CONFIGURATION> |
|
42 |
<JOB_INTERFACE> |
|
43 |
<PARAM name="hbase.mapred.inputtable" required="true" description="source hbase table" /> |
|
44 |
<PARAM name="hbase.mapred.outputtable" required="true" description="target hbase table" /> |
|
45 |
<PARAM name="hbase.mapreduce.inputtable" required="true" description="source hbase table" /> |
|
46 |
<PARAM name="hbase.mapreduce.outputtable" required="true" description="target hbase table" /> |
|
47 |
</JOB_INTERFACE> |
|
48 |
<SCAN caching="10"> |
|
49 |
<FILTERS operator="MUST_PASS_ALL"> |
|
50 |
<FILTER type="prefix" param="entityTypeId" /> |
|
51 |
</FILTERS> |
|
52 |
<FAMILIES/> |
|
53 |
</SCAN> |
|
54 |
</HADOOP_JOB> |
|
55 |
<STATUS> |
|
56 |
<LAST_SUBMISSION_DATE value="2001-12-31T12:00:00"/> |
|
57 |
<RUNNING_INSTANCES value="0"/> |
|
58 |
<CUMULATIVE_RUN value="0" /> |
|
59 |
</STATUS> |
|
60 |
<SECURITY_PARAMETERS>SECURITY_PARAMETERS</SECURITY_PARAMETERS> |
|
61 |
</BODY> |
|
62 |
</RESOURCE_PROFILE> |
modules/dnet-openaireplus-profiles/tags/dnet-openaireplus-profiles-1.0.11/src/main/resources/eu/dnetlib/test/profiles/HadoopJobConfigurationDSResources/HadoopJobConfigurationDSResourceType/iisPreprocessingQuickJob.xml | ||
---|---|---|
1 |
<RESOURCE_PROFILE> |
|
2 |
<HEADER> |
|
3 |
<RESOURCE_IDENTIFIER value="13beed98-81bf-4fbd-ab4f-de071177997c_SGFkb29wSm9iQ29uZmlndXJhdGlvbkRTUmVzb3VyY2VzL0hhZG9vcEpvYkNvbmZpZ3VyYXRpb25EU1Jlc291cmNlVHlwZQ=="/> |
|
4 |
<RESOURCE_TYPE value="HadoopJobConfigurationDSResourceType"/> |
|
5 |
<RESOURCE_KIND value="HadoopJobConfigurationDSResources"/> |
|
6 |
<RESOURCE_URI value=""/> |
|
7 |
<DATE_OF_CREATION value="2001-12-31T12:00:00"/> |
|
8 |
</HEADER> |
|
9 |
<BODY> |
|
10 |
<HADOOP_JOB name="iisPreprocessingQuickJob" type="oozie"> |
|
11 |
<DESCRIPTION>IIS preprocessing</DESCRIPTION> |
|
12 |
<STATIC_CONFIGURATION> |
|
13 |
<!-- Cluster wide --> |
|
14 |
<PROPERTY key="queueName" value="default"/> |
|
15 |
<PROPERTY key="user.name" value="dnet" /> |
|
16 |
|
|
17 |
<!-- Runtime --> |
|
18 |
<PROPERTY key="workingDir" value="/tmp/integration/working_dir/preprocessing_quick_test"/> |
|
19 |
<PROPERTY key="oozie.wf.validate.ForkJoin" value="false"/> |
|
20 |
<PROPERTY key="input_referenceextraction_project" value="/user/marek.horst/share/referenceextraction/document_projects/2014-04-11"/> |
|
21 |
<PROPERTY key="input_referenceextraction_dataset" value="/user/marek.horst/share/referenceextraction/document_datasets/all/2014-04-11"/> |
|
22 |
<PROPERTY key="export_action_hbase_table_initialize" value="false"/> |
|
23 |
</STATIC_CONFIGURATION> |
|
24 |
<JOB_INTERFACE> |
|
25 |
<PARAM name="import_content_object_store_location" required="true" description="object store service location" /> |
|
26 |
<PARAM name="import_mdstore_service_location" required="true" description="mdstore service location" /> |
|
27 |
<PARAM name="import_dataset_mdstore_ids_csv" required="true" description="mdstore id for dataset records" /> |
|
28 |
<PARAM name="import_wos_mdstore_id" required="true" description="mdstore id for WoS records" /> |
|
29 |
<PARAM name="import_database_service_location" required="true" description="database service endpoint" /> |
|
30 |
<PARAM name="import_content_datacite_objectstores_csv" required="true" description="objectstore ids subject to dataset reference extraction" /> |
|
31 |
<PARAM name="import_content_wos_plaintext_objectstores_csv" required="true" description="objectstore ids for WoS plaintext" /> |
|
32 |
<PARAM name="export_action_hbase_table_name" required="true" description="destination action manager table" /> |
|
33 |
<PARAM name="export_action_hbase_remote_zookeeper_quorum" required="true" description="ZK quorum" /> |
|
34 |
<PARAM name="export_action_hbase_remote_zookeeper_clientport" required="true" description="ZK port" /> |
|
35 |
<PARAM name="nameNode" required="true" description="hdfs name node" /> |
|
36 |
<PARAM name="jobTracker" required="true" description="job tracker name" /> |
|
37 |
<PARAM name="oozie.wf.application.path" required="true" description="oozie job application absolute path" /> |
|
38 |
<PARAM name="export_action_set_id_document_referencedProjects" required="true" description="target action set for project references" /> |
|
39 |
<PARAM name="export_action_set_id_document_referencedDatasets" required="true" description="target action set for dataset references" /> |
|
40 |
<PARAM name="export_action_set_id_entity_wos" required="true" description="target action set for WoS entities" /> |
|
41 |
<PARAM name="export_action_set_id_entity_dataset" required="true" description="target action set for dataset entities" /> |
|
42 |
</JOB_INTERFACE> |
|
43 |
</HADOOP_JOB> |
|
44 |
<STATUS> |
|
45 |
<LAST_SUBMISSION_DATE value="2001-12-31T12:00:00"/> |
|
46 |
<RUNNING_INSTANCES value="0"/> |
|
47 |
<CUMULATIVE_RUN value="0" /> |
|
48 |
</STATUS> |
|
49 |
<SECURITY_PARAMETERS>SECURITY_PARAMETERS</SECURITY_PARAMETERS> |
|
50 |
</BODY> |
|
51 |
</RESOURCE_PROFILE> |
modules/dnet-openaireplus-profiles/tags/dnet-openaireplus-profiles-1.0.11/src/main/resources/eu/dnetlib/test/profiles/HadoopJobConfigurationDSResources/HadoopJobConfigurationDSResourceType/dedupRootsPersonExportJob.xml | ||
---|---|---|
1 |
<RESOURCE_PROFILE> |
|
2 |
<HEADER> |
|
3 |
<RESOURCE_IDENTIFIER value="4c63a9ab-057f-442c-8da2-9b956c41e645_SGFkb29wSm9iQ29uZmlndXJhdGlvbkRTUmVzb3VyY2VzL0hhZG9vcEpvYkNvbmZpZ3VyYXRpb25EU1Jlc291cmNlVHlwZQ=="/> |
|
4 |
<RESOURCE_TYPE value="HadoopJobConfigurationDSResourceType"/> |
|
5 |
<RESOURCE_KIND value="HadoopJobConfigurationDSResources"/> |
|
6 |
<RESOURCE_URI value=""/> |
|
7 |
<DATE_OF_CREATION value="2001-12-31T12:00:00"/> |
|
8 |
</HEADER> |
|
9 |
<BODY> |
|
10 |
<HADOOP_JOB name="dedupRootsPersonExportJob" type="mapreduce"> |
|
11 |
<DESCRIPTION>map only job that exports the representative persons as json</DESCRIPTION> |
|
12 |
<STATIC_CONFIGURATION> |
|
13 |
|
|
14 |
<!-- I/O FORMAT --> |
|
15 |
<PROPERTY key="mapreduce.inputformat.class" value="org.apache.hadoop.hbase.mapreduce.TableInputFormat" /> |
|
16 |
<PROPERTY key="mapreduce.outputformat.class" value="org.apache.hadoop.mapreduce.lib.output.TextOutputFormat" /> |
|
17 |
|
|
18 |
<!-- MAPPER --> |
|
19 |
<PROPERTY key="mapreduce.map.class" value="eu.dnetlib.data.mapreduce.hbase.dedup.gt.RootPersonExportMapper" /> |
|
20 |
<PROPERTY key="mapred.mapoutput.key.class" value="org.apache.hadoop.io.Text" /> |
|
21 |
<PROPERTY key="mapred.mapoutput.value.class" value="org.apache.hadoop.io.Text" /> |
|
22 |
|
|
23 |
|
|
24 |
<!-- MISC --> |
|
25 |
|
|
26 |
<PROPERTY key="mapred.reduce.tasks.speculative.execution" value="false" /> |
|
27 |
<PROPERTY key="mapred.map.tasks.speculative.execution" value="false" /> |
|
28 |
<PROPERTY key="mapreduce.map.speculative" value="false" /> |
|
29 |
<PROPERTY key="mapreduce.reduce.speculative" value="false" /> |
|
30 |
|
|
31 |
<PROPERTY key="dfs.blocksize" value="256M" /> |
|
32 |
|
|
33 |
<PROPERTY key="mapred.reduce.tasks" value="1" /> |
|
34 |
<!-- <PROPERTY key="user.name" value="dnet" /> --> |
|
35 |
|
|
36 |
<!-- Uncomment to override the default lib path --> |
|
37 |
<!-- <PROPERTY key="job.lib" value="/user/dnet/dnet-mapreduce-jobs-0.0.2-SNAPSHOT-jar-with-dependencies.jar"/> --> |
|
38 |
</STATIC_CONFIGURATION> |
|
39 |
<JOB_INTERFACE> |
|
40 |
<PARAM name="hbase.mapred.inputtable" required="true" description="source hbase table" /> |
|
41 |
<PARAM name="hbase.mapreduce.inputtable" required="true" description="source hbase table" /> |
|
42 |
|
|
43 |
<PARAM name="mapred.output.dir" required="true" description="target output directory on hdfs" /> |
|
44 |
</JOB_INTERFACE> |
|
45 |
<SCAN> |
|
46 |
<FILTERS operator="MUST_PASS_ALL"> |
|
47 |
<FILTER type="prefix" param="entityTypeId" /> |
|
48 |
</FILTERS> |
|
49 |
<FAMILIES> |
|
50 |
<FAMILY param="entityType" /> |
|
51 |
</FAMILIES> |
|
52 |
</SCAN> |
|
53 |
</HADOOP_JOB> |
|
54 |
<STATUS> |
|
55 |
<LAST_SUBMISSION_DATE value="2001-12-31T12:00:00"/> |
|
56 |
<RUNNING_INSTANCES value="0"/> |
|
57 |
<CUMULATIVE_RUN value="0" /> |
|
58 |
</STATUS> |
|
59 |
<SECURITY_PARAMETERS>SECURITY_PARAMETERS</SECURITY_PARAMETERS> |
|
60 |
</BODY> |
|
61 |
</RESOURCE_PROFILE> |
modules/dnet-openaireplus-profiles/tags/dnet-openaireplus-profiles-1.0.11/src/main/resources/eu/dnetlib/test/profiles/HadoopJobConfigurationDSResources/HadoopJobConfigurationDSResourceType/resetDedupJob.xml | ||
---|---|---|
1 |
<RESOURCE_PROFILE> |
|
2 |
<HEADER> |
|
3 |
<RESOURCE_IDENTIFIER value="bc4f377a-af07-403d-a019-af60aa557652_SGFkb29wSm9iQ29uZmlndXJhdGlvbkRTUmVzb3VyY2VzL0hhZG9vcEpvYkNvbmZpZ3VyYXRpb25EU1Jlc291cmNlVHlwZQ=="/> |
|
4 |
<RESOURCE_TYPE value="HadoopJobConfigurationDSResourceType"/> |
|
5 |
<RESOURCE_KIND value="HadoopJobConfigurationDSResources"/> |
|
6 |
<RESOURCE_URI value=""/> |
|
7 |
<DATE_OF_CREATION value="2001-12-31T12:00:00"/> |
|
8 |
</HEADER> |
|
9 |
<BODY> |
|
10 |
<HADOOP_JOB name="resetDedupJob" type="mapreduce"> |
|
11 |
<DESCRIPTION>map only job that resets the dedup jobs</DESCRIPTION> |
|
12 |
<STATIC_CONFIGURATION> |
|
13 |
|
|
14 |
<!-- I/O FORMAT --> |
|
15 |
<PROPERTY key="mapreduce.inputformat.class" value="org.apache.hadoop.hbase.mapreduce.TableInputFormat" /> |
|
16 |
<PROPERTY key="mapreduce.outputformat.class" value="org.apache.hadoop.hbase.mapreduce.TableOutputFormat" /> |
|
17 |
|
|
18 |
<!-- MAPPER --> |
|
19 |
<PROPERTY key="mapreduce.map.class" value="eu.dnetlib.data.mapreduce.hbase.reset.HBaseResetMapper" /> |
|
20 |
<PROPERTY key="mapred.mapoutput.key.class" value="org.apache.hadoop.io.Text" /> |
|
21 |
<PROPERTY key="mapred.mapoutput.value.class" value="org.apache.hadoop.hbase.client.Mutation" /> |
|
22 |
|
|
23 |
<!-- MISC --> |
|
24 |
<PROPERTY key="mapred.map.tasks.speculative.execution" value="false" /> |
|
25 |
<PROPERTY key="mapreduce.map.speculative" value="false" /> |
|
26 |
<PROPERTY key="mapred.reduce.tasks" value="0" /> |
|
27 |
<!-- <PROPERTY key="user.name" value="dnet" /> --> |
|
28 |
|
|
29 |
<!-- Uncomment to override the default lib path --> |
|
30 |
<!-- <PROPERTY key="job.lib" value="/user/dnet/dnet-mapreduce-jobs-0.0.2-SNAPSHOT-jar-with-dependencies.jar"/> --> |
|
31 |
|
|
32 |
</STATIC_CONFIGURATION> |
|
33 |
<JOB_INTERFACE> |
|
34 |
<PARAM name="hbase.mapred.inputtable" required="true" description="source hbase table" /> |
|
35 |
<PARAM name="hbase.mapred.outputtable" required="true" description="target hbase table" /> |
|
36 |
<PARAM name="hbase.mapreduce.inputtable" required="true" description="source hbase table" /> |
|
37 |
<PARAM name="hbase.mapreduce.outputtable" required="true" description="target hbase table" /> |
|
38 |
</JOB_INTERFACE> |
|
39 |
</HADOOP_JOB> |
|
40 |
<STATUS> |
|
41 |
<LAST_SUBMISSION_DATE value="2001-12-31T12:00:00"/> |
|
42 |
<RUNNING_INSTANCES value="0"/> |
|
43 |
<CUMULATIVE_RUN value="0" /> |
|
44 |
</STATUS> |
|
45 |
<SECURITY_PARAMETERS>SECURITY_PARAMETERS</SECURITY_PARAMETERS> |
|
46 |
</BODY> |
|
47 |
</RESOURCE_PROFILE> |
modules/dnet-openaireplus-profiles/tags/dnet-openaireplus-profiles-1.0.11/src/main/resources/eu/dnetlib/test/profiles/HadoopJobConfigurationDSResources/HadoopJobConfigurationDSResourceType/dedupGTCleanerJob.xml | ||
---|---|---|
1 |
<RESOURCE_PROFILE> |
|
2 |
<HEADER> |
|
3 |
<RESOURCE_IDENTIFIER value="888ef72f-701a-4d59-8b8a-2ad01986f975_SGFkb29wSm9iQ29uZmlndXJhdGlvbkRTUmVzb3VyY2VzL0hhZG9vcEpvYkNvbmZpZ3VyYXRpb25EU1Jlc291cmNlVHlwZQ=="/> |
|
4 |
<RESOURCE_TYPE value="HadoopJobConfigurationDSResourceType"/> |
|
5 |
<RESOURCE_KIND value="HadoopJobConfigurationDSResources"/> |
|
6 |
<RESOURCE_URI value=""/> |
|
7 |
<DATE_OF_CREATION value="2001-12-31T12:00:00"/> |
|
8 |
</HEADER> |
|
9 |
<BODY> |
|
10 |
<HADOOP_JOB name="gtCleanerJob" type="mapreduce"> |
|
11 |
<DESCRIPTION>map reduce job that deletes the non-GT rows</DESCRIPTION> |
|
12 |
<STATIC_CONFIGURATION> |
|
13 |
|
|
14 |
<!-- I/O FORMAT --> |
|
15 |
<PROPERTY key="mapreduce.inputformat.class" value="org.apache.hadoop.hbase.mapreduce.TableInputFormat" /> |
|
16 |
<PROPERTY key="mapreduce.outputformat.class" value="org.apache.hadoop.hbase.mapreduce.TableOutputFormat" /> |
|
17 |
|
|
18 |
<!-- MAPPER --> |
|
19 |
<PROPERTY key="mapreduce.map.class" value="eu.dnetlib.data.mapreduce.hbase.dedup.gt.GTCleanerMapper" /> |
|
20 |
<PROPERTY key="mapred.mapoutput.key.class" value="org.apache.hadoop.hbase.io.ImmutableBytesWritable" /> |
|
21 |
<PROPERTY key="mapred.mapoutput.value.class" value="org.apache.hadoop.hbase.client.Delete" /> |
|
22 |
|
|
23 |
<!-- MISC --> |
|
24 |
<PROPERTY key="mapred.reduce.tasks.speculative.execution" value="false" /> |
|
25 |
<PROPERTY key="mapred.map.tasks.speculative.execution" value="false" /> |
|
26 |
<PROPERTY key="mapreduce.map.speculative" value="false" /> |
|
27 |
<PROPERTY key="mapreduce.reduce.speculative" value="false" /> |
|
28 |
|
|
29 |
<PROPERTY key="mapred.reduce.tasks" value="0" /> |
|
30 |
<!-- <PROPERTY key="user.name" value="dnet" /> --> |
|
31 |
|
|
32 |
<!-- Uncomment to override the default lib path --> |
|
33 |
<!-- <PROPERTY key="job.lib" value="/user/dnet/dnet-mapreduce-jobs-0.0.2-SNAPSHOT-jar-with-dependencies.jar"/> --> |
|
34 |
</STATIC_CONFIGURATION> |
|
35 |
<JOB_INTERFACE> |
|
36 |
<PARAM name="hbase.mapred.inputtable" required="true" description="source hbase table" /> |
|
37 |
<PARAM name="hbase.mapred.outputtable" required="true" description="target hbase table" /> |
|
38 |
<PARAM name="hbase.mapreduce.inputtable" required="true" description="source hbase table" /> |
|
39 |
<PARAM name="hbase.mapreduce.outputtable" required="true" description="target hbase table" /> |
|
40 |
</JOB_INTERFACE> |
|
41 |
<SCAN> |
|
42 |
<FILTERS operator="MUST_PASS_ALL"> |
|
43 |
<FILTER type="prefix" param="entityTypeId" /> |
|
44 |
</FILTERS> |
|
45 |
<FAMILIES> |
|
46 |
<FAMILY param="entityType" /> |
|
47 |
<FAMILY value="resultResult_dedupSimilarity_isSimilarTo" /> |
|
48 |
<FAMILY value="personPerson_dedupSimilarity_isSimilarTo" /> |
|
49 |
<FAMILY value="organizationOrganization_dedupSimilarity_isSimilarTo" /> |
|
50 |
</FAMILIES> |
|
51 |
</SCAN> |
|
52 |
</HADOOP_JOB> |
|
53 |
<STATUS> |
|
54 |
<LAST_SUBMISSION_DATE value="2001-12-31T12:00:00"/> |
|
55 |
<RUNNING_INSTANCES value="0"/> |
|
56 |
<CUMULATIVE_RUN value="0" /> |
|
57 |
</STATUS> |
|
58 |
<SECURITY_PARAMETERS>SECURITY_PARAMETERS</SECURITY_PARAMETERS> |
|
59 |
</BODY> |
|
60 |
</RESOURCE_PROFILE> |
modules/dnet-openaireplus-profiles/tags/dnet-openaireplus-profiles-1.0.11/src/main/resources/eu/dnetlib/test/profiles/HadoopJobConfigurationDSResources/HadoopJobConfigurationDSResourceType/elasticsearchTestJob.xml | ||
---|---|---|
1 |
<RESOURCE_PROFILE> |
|
2 |
<HEADER> |
|
3 |
<RESOURCE_IDENTIFIER value="719b5d2b-4156-4936-bbc3-41d908ec3c57_SGFkb29wSm9iQ29uZmlndXJhdGlvbkRTUmVzb3VyY2VzL0hhZG9vcEpvYkNvbmZpZ3VyYXRpb25EU1Jlc291cmNlVHlwZQ=="/> |
|
4 |
<RESOURCE_TYPE value="HadoopJobConfigurationDSResourceType"/> |
|
5 |
<RESOURCE_KIND value="HadoopJobConfigurationDSResources"/> |
|
6 |
<RESOURCE_URI value=""/> |
|
7 |
<DATE_OF_CREATION value="2001-12-31T12:00:00"/> |
|
8 |
</HEADER> |
|
9 |
<BODY> |
|
10 |
<HADOOP_JOB name="elastisearchTestJob" type="mapreduce"> |
|
11 |
<DESCRIPTION>map only job that indexes over ES</DESCRIPTION> |
|
12 |
<STATIC_CONFIGURATION> |
|
13 |
|
|
14 |
<!-- I/O FORMAT --> |
|
15 |
<PROPERTY key="mapreduce.inputformat.class" value="org.apache.hadoop.hbase.mapreduce.TableInputFormat" /> |
|
16 |
<PROPERTY key="mapreduce.outputformat.class" value="org.elasticsearch.hadoop.mr.EsOutputFormat" /> |
|
17 |
|
|
18 |
<!-- MAPPER --> |
|
19 |
<PROPERTY key="mapreduce.map.class" value="eu.dnetlib.data.mapreduce.es.ElasticsearchFeedMapper" /> |
|
20 |
<PROPERTY key="mapred.mapoutput.key.class" value="org.apache.hadoop.io.NullWritable" /> |
|
21 |
<PROPERTY key="mapred.mapoutput.value.class" value="org.apache.hadoop.io.BytesWritable" /> |
|
22 |
|
|
23 |
|
|
24 |
<!-- MISC --> |
|
25 |
<PROPERTY key="es.nodes" value="146.48.87.110:9200" /> |
|
26 |
<PROPERTY key="es.resource" value="openaire/oaf" /> |
|
27 |
<PROPERTY key="es.input.json" value="yes" /> |
|
28 |
|
|
29 |
<PROPERTY key="mapred.reduce.tasks" value="0" /> |
|
30 |
|
|
31 |
<PROPERTY key="mapred.reduce.tasks.speculative.execution" value="false" /> |
|
32 |
<PROPERTY key="mapred.map.tasks.speculative.execution" value="false" /> |
|
33 |
|
|
34 |
<!-- <PROPERTY key="user.name" value="dnet" /> --> |
|
35 |
|
|
36 |
<!-- Uncomment to override the default lib path --> |
|
37 |
<!-- <PROPERTY key="job.lib" value="/user/dnet/dnet-mapreduce-jobs-0.0.2-SNAPSHOT-jar-with-dependencies.jar"/> --> |
|
38 |
</STATIC_CONFIGURATION> |
|
39 |
<JOB_INTERFACE> |
|
40 |
<PARAM name="hbase.mapred.inputtable" required="true" description="source hbase table" /> |
|
41 |
<PARAM name="hbase.mapreduce.inputtable" required="true" description="source hbase table" /> |
|
42 |
|
|
43 |
<PARAM name="mapred.output.dir" required="true" description="target sequence file on hdfs" /> |
|
44 |
</JOB_INTERFACE> |
|
45 |
<SCAN> |
|
46 |
<FILTERS /> |
|
47 |
<FAMILIES /> |
|
48 |
</SCAN> |
|
49 |
</HADOOP_JOB> |
|
50 |
<STATUS> |
|
51 |
<LAST_SUBMISSION_DATE value="2001-12-31T12:00:00"/> |
|
52 |
<RUNNING_INSTANCES value="0"/> |
|
53 |
<CUMULATIVE_RUN value="0" /> |
|
54 |
</STATUS> |
|
55 |
<SECURITY_PARAMETERS>SECURITY_PARAMETERS</SECURITY_PARAMETERS> |
|
56 |
</BODY> |
|
57 |
</RESOURCE_PROFILE> |
modules/dnet-openaireplus-profiles/tags/dnet-openaireplus-profiles-1.0.11/src/main/resources/eu/dnetlib/test/profiles/HadoopJobConfigurationDSResources/HadoopJobConfigurationDSResourceType/dedupRootsToCSVJob.xml | ||
---|---|---|
1 |
<RESOURCE_PROFILE> |
|
2 |
<HEADER> |
|
3 |
<RESOURCE_IDENTIFIER value="027554bd-3d5c-4c50-9170-90d8c4402bc3_SGFkb29wSm9iQ29uZmlndXJhdGlvbkRTUmVzb3VyY2VzL0hhZG9vcEpvYkNvbmZpZ3VyYXRpb25EU1Jlc291cmNlVHlwZQ=="/> |
|
4 |
<RESOURCE_TYPE value="HadoopJobConfigurationDSResourceType"/> |
|
5 |
<RESOURCE_KIND value="HadoopJobConfigurationDSResources"/> |
|
6 |
<RESOURCE_URI value=""/> |
|
7 |
<DATE_OF_CREATION value="2001-12-31T12:00:00"/> |
|
8 |
</HEADER> |
|
9 |
<BODY> |
|
10 |
<HADOOP_JOB name="dedupRootsToCSVJob" type="mapreduce"> |
|
11 |
<DESCRIPTION>map only job that exports the representatives as CSV files</DESCRIPTION> |
|
12 |
<STATIC_CONFIGURATION> |
|
13 |
|
|
14 |
<!-- I/O FORMAT --> |
|
15 |
<PROPERTY key="mapreduce.inputformat.class" value="org.apache.hadoop.hbase.mapreduce.TableInputFormat" /> |
|
16 |
<PROPERTY key="mapreduce.outputformat.class" value="org.apache.hadoop.mapreduce.lib.output.LazyOutputFormat" /> |
|
17 |
<PROPERTY key="mapreduce.output.lazyoutputformat.outputformat" value="org.apache.hadoop.mapreduce.lib.output.TextOutputFormat" /> |
|
18 |
|
|
19 |
<!-- MAPPER --> |
|
20 |
<PROPERTY key="mapreduce.map.class" value="eu.dnetlib.data.mapreduce.hbase.dedup.DedupRootsToCsvMapper" /> |
|
21 |
<PROPERTY key="mapred.mapoutput.key.class" value="org.apache.hadoop.hbase.io.ImmutableBytesWritable" /> |
|
22 |
<PROPERTY key="mapred.mapoutput.value.class" value="org.apache.hadoop.hbase.io.ImmutableBytesWritable" /> |
|
23 |
|
|
24 |
<!-- REDUCER --> |
|
25 |
<PROPERTY key="mapreduce.reduce.class" value="eu.dnetlib.data.mapreduce.hbase.dedup.DedupRootsToCsvReducer" /> |
|
26 |
<PROPERTY key="mapred.output.key.class" value="org.apache.hadoop.io.Text" /> |
|
27 |
<PROPERTY key="mapred.output.value.class" value="org.apache.hadoop.io.Text" /> |
|
28 |
|
|
29 |
<!-- MULTIPLE OUTPUT --> |
|
30 |
<PROPERTY key="mapreduce.multipleoutputs" value="NativeGroups Groups NativeEntities" /> |
|
31 |
|
|
32 |
<PROPERTY key="mapreduce.multipleoutputs.namedOutput.NativeGroups.format" value="org.apache.hadoop.mapreduce.lib.output.TextOutputFormat" /> |
|
33 |
<PROPERTY key="mapreduce.multipleoutputs.namedOutput.NativeGroups.key" value="org.apache.hadoop.io.Text" /> |
|
34 |
<PROPERTY key="mapreduce.multipleoutputs.namedOutput.NativeGroups.value" value="org.apache.hadoop.io.Text" /> |
|
35 |
|
|
36 |
<PROPERTY key="mapreduce.multipleoutputs.namedOutput.Groups.format" value="org.apache.hadoop.mapreduce.lib.output.TextOutputFormat" /> |
|
37 |
<PROPERTY key="mapreduce.multipleoutputs.namedOutput.Groups.key" value="org.apache.hadoop.io.Text" /> |
|
38 |
<PROPERTY key="mapreduce.multipleoutputs.namedOutput.Groups.value" value="org.apache.hadoop.io.Text" /> |
|
39 |
|
|
40 |
<PROPERTY key="mapreduce.multipleoutputs.namedOutput.NativeEntities.format" value="org.apache.hadoop.mapreduce.lib.output.TextOutputFormat" /> |
|
41 |
<PROPERTY key="mapreduce.multipleoutputs.namedOutput.NativeEntities.key" value="org.apache.hadoop.io.Text" /> |
|
42 |
<PROPERTY key="mapreduce.multipleoutputs.namedOutput.NativeEntities.value" value="org.apache.hadoop.io.Text" /> |
|
43 |
|
|
44 |
<!-- MISC --> |
|
45 |
|
|
46 |
<PROPERTY key="mapred.textoutputformat.wrapper" value="#"/> |
|
47 |
<PROPERTY key="mapred.textoutputformat.separator" value="!"/> |
|
48 |
|
|
49 |
<PROPERTY key="mapred.reduce.tasks.speculative.execution" value="false" /> |
|
50 |
<PROPERTY key="mapred.map.tasks.speculative.execution" value="false" /> |
|
51 |
<PROPERTY key="mapreduce.map.speculative" value="false" /> |
|
52 |
<PROPERTY key="mapreduce.reduce.speculative" value="false" /> |
|
53 |
|
|
54 |
<PROPERTY key="mapred.reduce.tasks" value="3" /> |
|
55 |
<!-- <PROPERTY key="user.name" value="dnet" /> --> |
|
56 |
|
|
57 |
<!-- Uncomment to override the default lib path --> |
|
58 |
<!-- <PROPERTY key="job.lib" value="/user/dnet/dnet-mapreduce-jobs-0.0.2-SNAPSHOT-jar-with-dependencies.jar"/> --> |
|
59 |
</STATIC_CONFIGURATION> |
|
60 |
<JOB_INTERFACE> |
|
61 |
<PARAM name="hbase.mapred.inputtable" required="true" description="source hbase table" /> |
|
62 |
<PARAM name="hbase.mapreduce.inputtable" required="true" description="source hbase table" /> |
|
63 |
|
|
64 |
<PARAM name="mapred.output.dir" required="true" description="target output directory on hdfs" /> |
|
65 |
</JOB_INTERFACE> |
|
66 |
<SCAN> |
|
67 |
<FILTERS operator="MUST_PASS_ALL"> |
|
68 |
<FILTER type="prefix" param="entityTypeId" /> |
|
69 |
</FILTERS> |
|
70 |
<FAMILIES> |
|
71 |
<FAMILY param="entityType" /> |
|
72 |
<FAMILY value="resultResult_dedup_merges" /> |
|
73 |
<FAMILY value="personPerson_dedup_merges" /> |
|
74 |
<FAMILY value="organizationOrganization_dedup_merges" /> |
|
75 |
</FAMILIES> |
|
76 |
</SCAN> |
|
77 |
</HADOOP_JOB> |
|
78 |
<STATUS> |
|
79 |
<LAST_SUBMISSION_DATE value="2001-12-31T12:00:00"/> |
|
80 |
<RUNNING_INSTANCES value="0"/> |
|
81 |
<CUMULATIVE_RUN value="0" /> |
|
82 |
</STATUS> |
|
83 |
<SECURITY_PARAMETERS>SECURITY_PARAMETERS</SECURITY_PARAMETERS> |
|
84 |
</BODY> |
|
85 |
</RESOURCE_PROFILE> |
modules/dnet-openaireplus-profiles/tags/dnet-openaireplus-profiles-1.0.11/src/main/resources/eu/dnetlib/test/profiles/DedupConfigurationDSResources/DedupConfigurationDSResourceType/result.step.01.xml | ||
---|---|---|
1 |
<RESOURCE_PROFILE> |
|
2 |
<HEADER> |
|
3 |
<RESOURCE_IDENTIFIER value="c611ec67-eefc-4ffe-a5d4-cb3fc40a8baf_RGVkdXBDb25maWd1cmF0aW9uRFNSZXNvdXJjZXMvRGVkdXBDb25maWd1cmF0aW9uRFNSZXNvdXJjZVR5cGU="/> |
|
4 |
<RESOURCE_TYPE value="DedupConfigurationDSResourceType"/> |
|
5 |
<RESOURCE_KIND value="DedupConfigurationDSResources"/> |
|
6 |
<RESOURCE_URI value=""/> |
|
7 |
<DATE_OF_CREATION value="2001-12-31T12:00:00"/> |
|
8 |
</HEADER> |
|
9 |
<BODY> |
|
10 |
<CONFIGURATION> |
|
11 |
<DESCRIPTION>1 - Publication: Match against the title, whose numbers must match</DESCRIPTION> |
|
12 |
<DEDUPLICATION> |
|
13 |
{ |
|
14 |
"wf" : { |
|
15 |
"threshold" : "0.99", |
|
16 |
"dedupRun" : "001", |
|
17 |
"entityType" : "result", |
|
18 |
"orderField" : "title", |
|
19 |
"queueMaxSize" : "2000", |
|
20 |
"groupMaxSize" : "10", |
|
21 |
"slidingWindowSize" : "200", |
|
22 |
"rootBuilder" : [ "result", "personResult_authorship_hasAuthor", "resultProject_outcome_isProducedBy", "resultResult_publicationDataset_isRelatedTo", "resultResult_similarity_isAmongTopNSimilarDocuments", "resultResult_similarity_hasAmongTopNSimilarDocuments" ], |
|
23 |
"includeChildren" : "true" |
|
24 |
}, |
|
25 |
"pace" : { |
|
26 |
"clustering" : [ |
|
27 |
{ "name" : "ngrampairs", "fields" : [ "title" ], "params" : { "max" : "1", "ngramLen" : "3"} }, |
|
28 |
{ "name" : "suffixprefix", "fields" : [ "title" ], "params" : { "max" : "1", "len" : "3" } } |
|
29 |
], |
|
30 |
"conditions" : [ |
|
31 |
{ "name" : "titleVersionMatch", "fields" : [ "title" ] }, |
|
32 |
{ "name" : "sizeMatch", "fields" : [ "authors" ] } |
|
33 |
], |
|
34 |
"model" : [ |
|
35 |
{ "name" : "title", "algo" : "LevensteinTitle", "type" : "String", "weight" : "1.0", "ignoreMissing" : "false", "path" : "result/metadata/title/value" }, |
|
36 |
{ "name" : "authors", "algo" : "Null", "type" : "List", "weight" : "0.0", "ignoreMissing" : "true", "path" : "result/author/metadata/fullname/value" } |
|
37 |
], |
|
38 |
"blacklists" : { |
|
39 |
"title" : [ |
|
40 |
|
|
41 |
"^THE ASSOCIATION AND THE GENERAL MEDICAL COUNCIL$", |
|
42 |
"^Problems with perinatal pathology\.?$", |
|
43 |
|
|
44 |
"(?i)^Cases? of Puerperal Convulsions$", |
|
45 |
"(?i)^Operative Gyna?ecology$", |
|
46 |
"(?i)^Mind the gap\!?\:?$", |
|
47 |
"^Chronic fatigue syndrome\.?$", |
|
48 |
"^Cartas? ao editor Letters? to the Editor$", |
|
49 |
"^Note from the Editor$", |
|
50 |
"^Anesthesia Abstract$", |
|
51 |
|
|
52 |
"^Annual report$", |
|
53 |
"(?i)^“?THE RADICAL PREVENTION OF VENEREAL DISEASE\.?”?$", |
|
54 |
"(?i)^Graph and Table of Infectious Diseases?$", |
|
55 |
"^Presentation$", |
|
56 |
"(?i)^Reviews and Information on Publications$", |
|
57 |
"(?i)^PUBLIC HEALTH SERVICES?$", |
|
58 |
"(?i)^COMBINED TEXT-?BOOK OF OBSTETRICS AND GYN(Æ|ae)COLOGY$", |
|
59 |
"(?i)^Adrese autora$", |
|
60 |
"(?i)^Systematic Part .*\. Catalogus Fossilium Austriae, Band 2: Echinoidea neogenica$", |
|
61 |
"(?i)^Acknowledgement to Referees$", |
|
62 |
"(?i)^Behçet's disease\.?$", |
|
63 |
"(?i)^Isolation and identification of restriction endonuclease.*$", |
|
64 |
"(?i)^CEREBROVASCULAR DISEASES?.?$", |
|
65 |
"(?i)^Screening for abdominal aortic aneurysms?\.?$", |
|
66 |
"^Event management$", |
|
67 |
"(?i)^Breakfast and Crohn's disease.*\.?$", |
|
68 |
"^Cálculo de concentraciones en disoluciones acuosas. Ejercicio interactivo\..*\.$", |
|
69 |
"(?i)^Genetic and functional analyses of SHANK2 mutations suggest a multiple hit model of Autism spectrum disorders?\.?$", |
|
70 |
"^Gushi hakubutsugaku$", |
|
71 |
|
|
72 |
"^Starobosanski nadpisi u Bosni i Hercegovini \(.*\)$", |
|
73 |
"^Intestinal spirocha?etosis$", |
|
74 |
"^Treatment of Rodent Ulcer$", |
|
75 |
"(?i)^\W*Cloud Computing\W*$", |
|
76 |
"^Compendio mathematico : en que se contienen todas las materias mas principales de las Ciencias que tratan de la cantidad$", |
|
77 |
"^Free Communications, Poster Presentations: Session [A-F]$", |
|
78 |
|
|
79 |
"^“The Historical Aspects? of Quackery\.?”$", |
|
80 |
"^A designated centre for people with disabilities operated by St John of God Community Services (Limited|Ltd), Louth$", |
|
81 |
"^P(er|re)-Mile Premiums for Auto Insurance\\.?$", |
|
82 |
"(?i)^Case Report$", |
|
83 |
"^Boletín Informativo$", |
|
84 |
"(?i)^Glioblastoma Multiforme$", |
|
85 |
"(?i)^Nuevos táxones animales descritos en la península Ibérica y Macaronesia desde 1994 \\(.*\\)$", |
|
86 |
"^Zaměstnanecké výhody$", |
|
87 |
"(?i)^The Economics of Terrorism and Counter-Terrorism: A Survey \\(Part .*\\)$", |
|
88 |
"(?i)^Carotid body tumours?\\.?$", |
|
89 |
"(?i)^\\[Españoles en Francia : La condición Emigrante.*\\]$", |
|
90 |
"^Avant-propos$", |
|
91 |
"(?i)^St\. Patrick's Cathedral, Dublin, County Dublin - Head(s)? and Capital(s)?$", |
|
92 |
"(?i)^St\. Patrick's Cathedral, Dublin, County Dublin - Bases?$", |
|
93 |
"(?i)^PUBLIC HEALTH VERSUS THE STATE$", |
|
94 |
"^Viñetas de Cortázar$", |
|
95 |
"(?i)^Search for heavy neutrinos and W(\\[|_|\\(|_\\{|-)?R(\\]|\\)|\\})? bosons with right-handed couplings in a left-right symmetric model in pp collisions at.*TeV(\.)?$", |
|
96 |
"(?i)^Measurement of the pseudorapidity and centrality dependence of the transverse energy density in Pb(-?)Pb collisions at.*tev(\.?)$", |
|
97 |
"(?i)^Search for resonances decaying into top-quark pairs using fully hadronic decays in pp collisions with ATLAS at.*TeV$", |
|
98 |
"(?i)^Search for neutral minimal supersymmetric standard model Higgs bosons decaying to tau pairs in pp collisions at.*tev$", |
|
99 |
|
|
100 |
"(?i)^Relatório de Estágio (de|em) Angiologia e Cirurgia Vascular$", |
|
101 |
"^Aus der AGMB$", |
|
102 |
|
|
103 |
"^Znanstveno-stručni prilozi$", |
|
104 |
"^Zhodnocení finanční situace podniku a návrhy na zlepšení$", |
|
105 |
"^Evaluation of the Financial Situation in the Firm and Proposals to its Improvement$", |
|
106 |
"^Finanční analýza podniku$", |
|
107 |
"^Financial analysis( of business)?$", |
|
108 |
"(?i)^Textbook of Gyn(a)?(Æ)?(e)?cology$", |
|
109 |
"^Jikken nihon shūshinsho$", |
|
110 |
"(?i)^CORONER('|s)(s|') INQUESTS$", |
|
111 |
"(?i)^(Μελέτη παραγόντων )?risk management( για ανάπτυξη και εφαρμογή ενός πληροφοριακού συστήματος| και ανάπτυξη συστήματος)?$", |
|
112 |
"(?i)^Consultants' contract(s)?$", |
|
113 |
"(?i)^Upute autorima$", |
|
114 |
"(?i)^Bijdrage tot de Kennis van den Godsdienst der Dajaks van Lan(d|f)ak en Tajan$", |
|
115 |
"^Joshi shin kokubun$", |
|
116 |
"^Kōtō shōgaku dokuhon nōson'yō$", |
|
117 |
"^Jinjō shōgaku shōka$", |
|
118 |
"^Shōgaku shūjichō$", |
|
119 |
"^Nihon joshi dokuhon$", |
|
120 |
"^Joshi shin dokuhon$", |
|
121 |
"^Chūtō kanbun dokuhon$", |
|
122 |
"^Wabun dokuhon$", |
|
123 |
"(?i)^(Analysis of economy selected village or town|Rozbor hospodaření vybrané obce či města)$", |
|
124 |
"(?i)^cardiac rehabilitation$", |
|
125 |
"(?i)^Analytical summary$", |
|
126 |
"^Thesaurus resolutionum Sacrae Congregationis Concilii$", |
|
127 |
"(?i)^Sumario analítico(\\s{1})?(Analitic summary)?$", |
|
128 |
"^Prikazi i osvrti$", |
|
129 |
"^Rodinný dům s provozovnou$", |
|
130 |
"^Family house with an establishment$", |
|
131 |
"^Shinsei chūtō shin kokugun$", |
|
132 |
"^Pulmonary alveolar proteinosis(\\.?)$", |
|
133 |
"^Shinshū kanbun$", |
|
134 |
"^Viñeta(s?) de Rodríguez$", |
|
135 |
"(?i)^RUBRIKA UREDNIKA$", |
|
136 |
"^A Matching Model of the Academic Publication Market$", |
|
137 |
"^Yōgaku kōyō$", |
|
138 |
|
|
139 |
"^Internetový marketing$", |
|
140 |
"^Internet marketing$", |
|
141 |
"^Chūtō kokugo dokuhon$", |
|
142 |
"^Kokugo dokuhon$", |
|
143 |
"^Antibiotic Cover for Dental Extraction(s?)$", |
|
144 |
"^Strategie podniku$", |
|
145 |
"^Strategy of an Enterprise$", |
|
146 |
"(?i)^respiratory disease(s?)(\.?)$", |
|
147 |
"^Award(s?) for Gallantry in Civil Defence$", |
|
148 |
"^Podniková kultura$", |
|
149 |
"^Corporate Culture$", |
|
150 |
"^Severe hyponatraemia in hospital inpatient(s?)(\.?)$", |
|
151 |
"^Pracovní motivace$", |
|
152 |
"^Work Motivation$", |
|
153 |
"^Kaitei kōtō jogaku dokuhon$", |
|
154 |
"^Konsolidovaná účetní závěrka$", |
|
155 |
"^Consolidated Financial Statements$", |
|
156 |
"(?i)^intracranial tumour(s?)$", |
|
157 |
"^Climate Change Mitigation Options and Directed Technical Change: A Decentralized Equilibrium Analysis$", |
|
158 |
"^\\[CERVECERIAS MAHOU(\\.|\\:) INTERIOR\\] \\[Material gráfico\\]$", |
|
159 |
"^Housing Market Dynamics(\\:|\\.) On the Contribution of Income Shocks and Credit Constraint(s?)$", |
|
160 |
"^\\[Funciones auxiliares de la música en Radio París,.*\\]$", |
|
161 |
"^Úroveň motivačního procesu jako způsobu vedení lidí$", |
|
162 |
"^The level of motivation process as a leadership$", |
|
163 |
"^Pay-beds in N(\\.?)H(\\.?)S(\\.?) Hospitals$", |
|
164 |
"(?i)^news and events$", |
|
165 |
"(?i)^NOVOSTI I DOGAĐAJI$", |
|
166 |
"^Sansū no gakushū$", |
|
167 |
"^Posouzení informačního systému firmy a návrh změn$", |
|
168 |
"^Information System Assessment and Proposal for ICT Modification$", |
|
169 |
"^Stresové zatížení pracovníků ve vybrané profesi$", |
|
170 |
"^Stress load in a specific job$", |
|
171 |
|
|
172 |
"^Sunday: Poster Sessions, Pt.*$", |
|
173 |
"^Monday: Poster Sessions, Pt.*$", |
|
174 |
"^Wednesday: Poster Sessions, Pt.*$", |
|
175 |
"^Tuesday: Poster Sessions, Pt.*$", |
|
176 |
|
|
177 |
"^Analýza reklamy$", |
|
178 |
"^Analysis of advertising$", |
|
179 |
|
|
180 |
"^Shōgaku shūshinsho$", |
|
181 |
"^Shōgaku sansū$", |
|
182 |
"^Shintei joshi kokubun$", |
|
183 |
"^Taishō joshi kokubun dokuhon$", |
|
184 |
"^Joshi kokubun$", |
|
185 |
|
|
186 |
"^Účetní uzávěrka a účetní závěrka v ČR$", |
|
187 |
"(?i)^The \"?Causes\"? of Cancer$", |
|
188 |
"^Normas para la publicación de artículos$", |
|
189 |
"^Editor('|s)(s|') [Rr]eply$", |
|
190 |
"^Editor(’|s)(s|’) letter$", |
|
191 |
"^Redaktoriaus žodis$", |
|
192 |
"^DISCUSSION ON THE PRECEDING PAPER$", |
|
193 |
"^Kōtō shōgaku shūshinsho jidōyō$", |
|
194 |
"^Shōgaku nihon rekishi$", |
|
195 |
"^(Theory of the flow of action currents in isolated myelinated nerve fibers).*$", |
|
196 |
"^Préface$", |
|
197 |
"^Occupational [Hh]ealth [Ss]ervices.$", |
|
198 |
"^In Memoriam Professor Toshiyuki TAKESHIMA$", |
|
199 |
"^Účetní závěrka ve vybraném podniku.*$", |
|
200 |
"^Financial statements in selected company$", |
|
201 |
"^Abdominal [Aa]ortic [Aa]neurysms.*$", |
|
202 |
"^Pseudomyxoma peritonei$", |
|
203 |
"^Kazalo autora$", |
|
204 |
|
|
205 |
"(?i)^uvodna riječ$", |
|
206 |
"^Motivace jako způsob vedení lidí$", |
|
207 |
"^Motivation as a leadership$", |
|
208 |
"^Polyfunkční dům$", |
|
209 |
"^Multi\\-funkcional building$", |
|
210 |
"^Podnikatelský plán$", |
|
211 |
"^Business Plan$", |
|
212 |
"^Oceňování nemovitostí$", |
|
213 |
"^Marketingová komunikace$", |
|
214 |
"^Marketing communication$", |
|
215 |
"^Sumario Analítico$", |
|
216 |
"^Riječ uredništva$", |
|
217 |
"^Savjetovanja i priredbe$", |
|
218 |
"^Índice$", |
|
219 |
"^(Starobosanski nadpisi).*$", |
|
220 |
"^Vzdělávání pracovníků v organizaci$", |
|
221 |
"^Staff training in organization$", |
|
222 |
"^(Life Histories of North American Geometridae).*$", |
|
223 |
"^Strategická analýza podniku$", |
|
224 |
"^Strategic Analysis of an Enterprise$", |
|
225 |
"^Sadržaj$", |
|
226 |
"^Upute suradnicima$", |
|
227 |
"^Rodinný dům$", |
|
228 |
"(?i)^Fami(l)?ly house$", |
|
229 |
"^Upute autorima$", |
|
230 |
"^Strategic Analysis$", |
|
231 |
"^Finanční analýza vybraného podniku$", |
|
232 |
"^Finanční analýza$", |
|
233 |
"^Riječ urednika$", |
|
234 |
"(?i)^Content(s?)$", |
|
235 |
"(?i)^Inhalt$", |
|
236 |
"^Jinjō shōgaku shūshinsho jidōyō$", |
|
237 |
"(?i)^Index$", |
|
238 |
"^Chūgaku kokubun kyōkasho$", |
|
239 |
"^Retrato de una mujer$", |
|
240 |
"^Retrato de un hombre$", |
|
241 |
"^Kōtō shōgaku dokuhon$", |
|
242 |
"^Shotōka kokugo$", |
|
243 |
"^Shōgaku dokuhon$", |
|
244 |
"^Jinjō shōgaku kokugo dokuhon$", |
|
245 |
"^Shinsei kokugo dokuhon$", |
|
246 |
"^Teikoku dokuhon$", |
|
247 |
"^Instructions to Authors$", |
|
248 |
"^KİTAP TAHLİLİ$", |
|
249 |
"^PRZEGLĄD PIŚMIENNICTWA$", |
|
250 |
"(?i)^Presentación$", |
|
251 |
"^İçindekiler$", |
|
252 |
"(?i)^Tabl?e of contents$", |
|
253 |
"^(CODICE DEL BEATO DE LOS REYES FERNANDO I Y SANCHA).*$", |
|
254 |
"^(\\[MADRID\\. BIBL\\. NAC\\. N.*KING FERDINAND I.*FROM SAN ISIDORO DE LEON\\. FACUNDUS SCRIPSIT DATED.*\\]).*", |
|
255 |
"^Editorial( Board)?$", |
|
256 |
"(?i)^Editorial \\(English\\)$", |
|
257 |
"^Editörden$", |
|
258 |
"^(Corpus Oral Dialectal \\(COD\\)\\.).*$", |
|
259 |
"^(Kiri Karl Morgensternile).*$", |
|
260 |
"^(\\[Eksliibris Aleksandr).*\\]$", |
|
261 |
"^(\\[Eksliibris Aleksandr).*$", |
|
262 |
"^(Eksliibris Aleksandr).*$", |
|
263 |
"^(Kiri A\\. de Vignolles).*$", |
|
264 |
"^(2 kirja Karl Morgensternile).*$", |
|
265 |
"^(Pirita kloostri idaosa arheoloogilised).*$", |
|
266 |
"^(Kiri tundmatule).*$", |
|
267 |
"^(Kiri Jenaer Allgemeine Literaturzeitung toimetusele).*$", |
|
268 |
"^(Eksliibris Nikolai Birukovile).*$", |
|
269 |
"^(Eksliibris Nikolai Issakovile).*$", |
|
270 |
"^(WHP Cruise Summary Information of section).*$", |
|
271 |
"^(Measurement of the top quark\\-pair production cross section with ATLAS in pp collisions at).*$", |
|
272 |
"^(Measurement of the spin\\-dependent structure function).*" |
|
273 |
] |
|
274 |
} |
|
275 |
} |
|
276 |
} |
|
277 |
</DEDUPLICATION> |
|
278 |
</CONFIGURATION> |
|
279 |
<STATUS> |
|
280 |
<LAST_UPDATE value="2001-12-31T12:00:00"/> |
|
281 |
</STATUS> |
|
282 |
<SECURITY_PARAMETERS>SECURITY_PARAMETERS</SECURITY_PARAMETERS> |
|
283 |
</BODY> |
|
284 |
</RESOURCE_PROFILE> |
modules/dnet-openaireplus-profiles/tags/dnet-openaireplus-profiles-1.0.11/src/main/resources/eu/dnetlib/test/profiles/HadoopJobConfigurationDSResources/HadoopJobConfigurationDSResourceType/dedupDeleteDedupRelsJob.xml | ||
---|---|---|
1 |
<RESOURCE_PROFILE> |
|
2 |
<HEADER> |
|
3 |
<RESOURCE_IDENTIFIER value="5626c94e-0005-416a-9ea4-48fc8af85ecd_SGFkb29wSm9iQ29uZmlndXJhdGlvbkRTUmVzb3VyY2VzL0hhZG9vcEpvYkNvbmZpZ3VyYXRpb25EU1Jlc291cmNlVHlwZQ=="/> |
|
4 |
<RESOURCE_TYPE value="HadoopJobConfigurationDSResourceType"/> |
|
5 |
<RESOURCE_KIND value="HadoopJobConfigurationDSResources"/> |
|
6 |
<RESOURCE_URI value=""/> |
|
7 |
<DATE_OF_CREATION value="2001-12-31T12:00:00"/> |
|
8 |
</HEADER> |
|
9 |
<BODY> |
|
10 |
<HADOOP_JOB name="deleteDedupRelsJob" type="mapreduce"> |
|
11 |
<DESCRIPTION>map reduce job that deletes the dedup rels used in the deduplication process</DESCRIPTION> |
|
12 |
<STATIC_CONFIGURATION> |
|
13 |
|
|
14 |
<!-- I/O FORMAT --> |
|
15 |
<PROPERTY key="mapreduce.inputformat.class" value="org.apache.hadoop.hbase.mapreduce.TableInputFormat" /> |
|
16 |
<PROPERTY key="mapreduce.outputformat.class" value="org.apache.hadoop.hbase.mapreduce.TableOutputFormat" /> |
|
17 |
|
|
18 |
<!-- MAPPER --> |
|
19 |
<PROPERTY key="mapreduce.map.class" value="eu.dnetlib.data.mapreduce.hbase.dedup.DedupDeleteRelMapper" /> |
|
20 |
<PROPERTY key="mapred.mapoutput.key.class" value="org.apache.hadoop.hbase.io.ImmutableBytesWritable" /> |
|
21 |
<PROPERTY key="mapred.mapoutput.value.class" value="org.apache.hadoop.io.Writable" /> <!-- Writable is declared in org.apache.hadoop.io, not in the HBase io package. NOTE(review): confirm whether the mapper's concrete output type should be named here instead. --> |
|
22 |
|
|
23 |
<!-- MISC --> |
|
24 |
<PROPERTY key="mapred.reduce.tasks.speculative.execution" value="false" /> |
|
25 |
<PROPERTY key="mapred.map.tasks.speculative.execution" value="false" /> |
|
26 |
<PROPERTY key="mapreduce.map.speculative" value="false" /> |
|
27 |
<PROPERTY key="mapreduce.reduce.speculative" value="false" /> |
|
28 |
|
|
29 |
<PROPERTY key="mapred.reduce.tasks" value="0" /> |
|
30 |
<!-- <PROPERTY key="user.name" value="dnet" /> --> |
|
31 |
|
|
32 |
<!-- Uncomment to override the default lib path --> |
|
33 |
<!-- <PROPERTY key="job.lib" value="/user/dnet/dnet-mapreduce-jobs-0.0.2-SNAPSHOT-jar-with-dependencies.jar"/> --> |
|
34 |
</STATIC_CONFIGURATION> |
|
35 |
<JOB_INTERFACE> |
|
36 |
<PARAM name="hbase.mapred.inputtable" required="true" description="source hbase table" /> |
|
37 |
<PARAM name="hbase.mapred.outputtable" required="true" description="target hbase table" /> |
|
38 |
<PARAM name="hbase.mapreduce.inputtable" required="true" description="source hbase table" /> |
|
39 |
<PARAM name="hbase.mapreduce.outputtable" required="true" description="target hbase table" /> |
|
40 |
</JOB_INTERFACE> |
|
41 |
<SCAN> |
|
42 |
<FILTERS operator="MUST_PASS_ALL"> |
|
43 |
<FILTER type="prefix" param="entityTypeId" /> |
|
44 |
</FILTERS> |
|
45 |
<FAMILIES> |
|
46 |
<FAMILY param="entityType" /> |
|
47 |
<FAMILY value="resultResult_dedup_merges" /> |
|
48 |
<FAMILY value="resultResult_dedup_isMergedIn" /> |
|
49 |
<FAMILY value="resultResult_dedupSimilarity_isSimilarTo" /> |
|
50 |
|
|
51 |
<FAMILY value="personPerson_dedup_merges" /> |
|
52 |
<FAMILY value="personPerson_dedup_isMergedIn" /> |
|
53 |
<FAMILY value="personPerson_dedupSimilarity_isSimilarTo" /> |
|
54 |
|
|
55 |
<FAMILY value="organizationOrganization_dedup_merges" /> |
|
56 |
<FAMILY value="organizationOrganization_dedup_isMergedIn" /> |
|
57 |
<FAMILY value="organizationOrganization_dedupSimilarity_isSimilarTo" /> |
|
58 |
</FAMILIES> |
|
59 |
</SCAN> |
|
60 |
</HADOOP_JOB> |
|
61 |
<STATUS> |
|
62 |
<LAST_SUBMISSION_DATE value="2001-12-31T12:00:00"/> |
|
63 |
<RUNNING_INSTANCES value="0"/> |
|
64 |
<CUMULATIVE_RUN value="0" /> |
|
65 |
</STATUS> |
|
66 |
<SECURITY_PARAMETERS>SECURITY_PARAMETERS</SECURITY_PARAMETERS> |
|
67 |
</BODY> |
|
68 |
</RESOURCE_PROFILE> |
modules/dnet-openaireplus-profiles/tags/dnet-openaireplus-profiles-1.0.11/src/main/resources/eu/dnetlib/test/profiles/DedupConfigurationDSResources/DedupConfigurationDSResourceType/person.step.02.xml | ||
---|---|---|
1 |
<RESOURCE_PROFILE> |
|
2 |
<HEADER> |
|
3 |
<RESOURCE_IDENTIFIER value="82b1c7fb-c36c-4291-8863-0393c7c588ee_RGVkdXBDb25maWd1cmF0aW9uRFNSZXNvdXJjZXMvRGVkdXBDb25maWd1cmF0aW9uRFNSZXNvdXJjZVR5cGU="/> |
|
4 |
<RESOURCE_TYPE value="DedupConfigurationDSResourceType"/> |
|
5 |
<RESOURCE_KIND value="DedupConfigurationDSResources"/> |
|
6 |
<RESOURCE_URI value=""/> |
|
7 |
<DATE_OF_CREATION value="2001-12-31T12:00:00"/> |
|
8 |
</HEADER> |
|
9 |
<BODY> |
|
10 |
<CONFIGURATION> |
|
11 |
<DESCRIPTION>1 - Person: Decision tree</DESCRIPTION> |
|
12 |
<DEDUPLICATION> |
|
13 |
{ |
|
14 |
"wf" : { |
|
15 |
"threshold" : "1.0", |
|
16 |
"dedupRun" : "001", |
|
17 |
"entityType" : "person", |
|
18 |
"orderField" : "fullname", |
|
19 |
"queueMaxSize" : "2000", |
|
20 |
"groupMaxSize" : "10", |
|
21 |
"slidingWindowSize" : "200", |
|
22 |
"rootBuilder" : [ "person" ], |
|
23 |
"includeChildren" : "true" |
|
24 |
}, |
|
25 |
"pace" : { |
|
26 |
"clustering" : [ |
|
27 |
{ "name" : "personclustering", "fields" : [ "person" ], "params" : { } } |
|
28 |
], |
|
29 |
"model" : [ |
|
30 |
{ "name" : "fullname", "algo" : "JaroWinkler", "type" : "String", "weight" : "0.3", "ignoreMissing" : "false", "path" : "person/metadata/fullname/value", "params" : { } }, |
|
31 |
{ "name" : "person", "algo" : "PersonDistance", "type" : "JSON", "weight" : "0.7", "ignoreMissing" : "false", "path" : "person", "params" : { "common.anchors" : "1", "common.surnames" : "3" } }, |
|
32 |
{ "name" : "lastname", "algo" : "Null", "type" : "String", "weight" : "0", "ignoreMissing" : "true", "path" : "person/metadata/secondnames/value" } |
|
33 |
], |
|
34 |
"blacklists" : { |
|
35 |
"lastname" : [ |
|
36 |
"(?i)^wang$", |
|
37 |
"(?i)^~wang$", |
|
38 |
"(?i)^zhang$", |
|
39 |
"(?i)^zhou$", |
|
40 |
"(?i)^zhao$", |
|
41 |
"(?i)^li$", |
|
42 |
"(?i)^~li$", |
|
43 |
"(?i)^liu$", |
|
44 |
"(?i)^chen$", |
|
45 |
"(?i)^yang$", |
|
46 |
"(?i)^kim$", |
|
47 |
"(?i)^xu$", |
|
48 |
"(?i)^huang$", |
|
49 |
"(?i)^sun$", |
|
50 |
"(?i)^lee$", |
|
51 |
"(?i)^ma$", |
|
52 |
"(?i)^kim$", |
|
53 |
"(?i)^hu$", |
|
54 |
"(?i)^wu$", |
|
55 |
"(?i)^zhu$", |
|
56 |
"(?i)^lu$" |
|
57 |
] |
|
58 |
} |
|
59 |
} |
|
60 |
} |
|
61 |
</DEDUPLICATION> |
|
62 |
</CONFIGURATION> |
|
63 |
<STATUS> |
|
64 |
<LAST_UPDATE value="2001-12-31T12:00:00"/> |
|
65 |
</STATUS> |
|
66 |
<SECURITY_PARAMETERS>SECURITY_PARAMETERS</SECURITY_PARAMETERS> |
|
67 |
</BODY> |
|
68 |
</RESOURCE_PROFILE> |
modules/dnet-openaireplus-profiles/tags/dnet-openaireplus-profiles-1.0.11/src/main/resources/eu/dnetlib/test/profiles/HadoopJobConfigurationDSResources/HadoopJobConfigurationDSResourceType/personCsvJoinJob.xml | ||
---|---|---|
1 |
<RESOURCE_PROFILE> |
|
2 |
<HEADER> |
|
3 |
<RESOURCE_IDENTIFIER value="3f544a36-f123-4f5c-acf4-7c25f6591ec4_SGFkb29wSm9iQ29uZmlndXJhdGlvbkRTUmVzb3VyY2VzL0hhZG9vcEpvYkNvbmZpZ3VyYXRpb25EU1Jlc291cmNlVHlwZQ=="/> |
|
4 |
<RESOURCE_TYPE value="HadoopJobConfigurationDSResourceType"/> |
|
5 |
<RESOURCE_KIND value="HadoopJobConfigurationDSResources"/> |
|
6 |
<RESOURCE_URI value=""/> |
|
7 |
<DATE_OF_CREATION value="2001-12-31T12:00:00"/> |
|
8 |
</HEADER> |
|
9 |
<BODY> |
|
10 |
<HADOOP_JOB name="personCsvJoinJob" type="mapreduce"> |
|
11 |
<DESCRIPTION>map reduce job that joins person entities by "surname+first name letter" and serialises the output as csv</DESCRIPTION> |
|
12 |
<STATIC_CONFIGURATION> |
|
13 |
|
|
14 |
<!-- I/O FORMAT --> |
|
15 |
<PROPERTY key="mapreduce.inputformat.class" value="org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat" /> |
|
16 |
<PROPERTY key="mapreduce.outputformat.class" value="org.apache.hadoop.mapreduce.lib.output.LazyOutputFormat" /> |
|
17 |
<PROPERTY key="mapreduce.output.lazyoutputformat.outputformat" value="org.apache.hadoop.mapreduce.lib.output.TextOutputFormat" /> |
|
18 |
|
|
19 |
<!-- MAPPER --> |
|
20 |
<PROPERTY key="mapreduce.map.class" value="eu.dnetlib.data.mapreduce.hbase.dedup.experiment.JoinPersonGroupMapper" /> |
|
21 |
<PROPERTY key="mapred.mapoutput.key.class" value="org.apache.hadoop.io.Text" /> |
|
22 |
<PROPERTY key="mapred.mapoutput.value.class" value="org.apache.hadoop.io.Text" /> |
|
23 |
|
|
24 |
<!-- REDUCER --> |
|
25 |
<PROPERTY key="mapreduce.reduce.class" value="eu.dnetlib.data.mapreduce.hbase.dedup.experiment.JoinPersonGroupReducer" /> |
|
26 |
<PROPERTY key="mapred.output.key.class" value="org.apache.hadoop.io.Text" /> |
|
27 |
<PROPERTY key="mapred.output.value.class" value="org.apache.hadoop.io.Text" /> |
|
28 |
|
|
29 |
<!-- MISC --> |
|
30 |
<PROPERTY key="mapred.map.tasks.speculative.execution" value="false" /> |
|
31 |
<PROPERTY key="mapreduce.map.speculative" value="false" /> |
|
32 |
<PROPERTY key="mapred.reduce.tasks" value="10" /> |
|
33 |
|
|
34 |
<!-- <PROPERTY key="user.name" value="dnet" /> --> |
|
35 |
|
|
36 |
<!-- Uncomment to override the default lib path --> |
|
37 |
<!-- <PROPERTY key="job.lib" value="/user/dnet/dnet-mapreduce-jobs-0.0.2-SNAPSHOT-jar-with-dependencies.jar"/> --> |
|
38 |
|
|
39 |
</STATIC_CONFIGURATION> |
|
40 |
<JOB_INTERFACE> |
|
41 |
<PARAM name="mapred.input.dir" required="true" description="input sequence file" /> |
|
42 |
</JOB_INTERFACE> |
|
43 |
</HADOOP_JOB> |
|
44 |
<STATUS> |
|
45 |
<LAST_SUBMISSION_DATE value="2001-12-31T12:00:00"/> |
|
46 |
<RUNNING_INSTANCES value="0"/> |
|
47 |
<CUMULATIVE_RUN value="0" /> |
|
48 |
</STATUS> |
|
49 |
<SECURITY_PARAMETERS>SECURITY_PARAMETERS</SECURITY_PARAMETERS> |
|
50 |
</BODY> |

51 |
</RESOURCE_PROFILE> |
Also available in: Unified diff
[maven-release-plugin] copy for tag dnet-openaireplus-profiles-1.0.11