Revision 36726
Added by Claudio Atzori almost 9 years ago
modules/dnet-openaireplus-profiles/trunk/src/main/resources/eu/dnetlib/test/profiles/DedupConfigurationDSResources/DedupConfigurationDSResourceType/5f52f22e-b077-43ac-bf22-83de1543c9e1.xml | ||
---|---|---|
1 |
<RESOURCE_PROFILE> |
|
2 |
<HEADER> |
|
3 |
<RESOURCE_IDENTIFIER value="5f52f22e-b077-43ac-bf22-83de1543c9e1_RGVkdXBDb25maWd1cmF0aW9uRFNSZXNvdXJjZXMvRGVkdXBDb25maWd1cmF0aW9uRFNSZXNvdXJjZVR5cGU="/> |
|
4 |
<RESOURCE_TYPE value="DedupConfigurationDSResourceType"/> |
|
5 |
<RESOURCE_KIND value="DedupConfigurationDSResources"/> |
|
6 |
<RESOURCE_URI value=""/> |
|
7 |
<DATE_OF_CREATION value="2015-01-18T11:37:10+00:00"/> |
|
8 |
</HEADER> |
|
9 |
<BODY> |
|
10 |
<CONFIGURATION id="00"> |
|
11 |
<DESCRIPTION>00 - Generic configuration</DESCRIPTION> |
|
12 |
<DEDUPLICATION> |
|
13 |
<ENTITY name="result"> |
|
14 |
<PACE>pace.conf { |
|
15 |
clustering { |
|
16 |
ngrampairs { fields = [title], params = { max = 1, ngramLen = 3} }, |
|
17 |
suffixprefix { fields = [title], params = { max = 1, len = 3 } } }, |
|
18 |
conditions { |
|
19 |
titleVersionMatch { fields = [title] } }, |
|
20 |
model { |
|
21 |
title { algo = JaroWinkler, type = String, weight = 1.0, ignoreMissing = false, path = result/metadata/title/value } }, |
|
22 |
blacklists = { title = [ |
|
23 |
"^(Corpus Oral Dialectal \\(COD\\)\\.).*$", |
|
24 |
"^(Kiri Karl Morgensternile).*$", |
|
25 |
"^(\\[Eksliibris Aleksandr).*\\]$", |
|
26 |
"^(\\[Eksliibris Aleksandr).*$", |
|
27 |
"^(Kiri A\\. de Vignolles).*$", |
|
28 |
"^(2 kirja Karl Morgensternile).*$", |
|
29 |
"^(Pirita kloostri idaosa arheoloogilised).*$", |
|
30 |
"^(Kiri tundmatule).*$", |
|
31 |
"^(Kiri Jenaer Allgemeine Literaturzeitung toimetusele).*$", |
|
32 |
"^(Eksliibris Nikolai Birukovile).*$", |
|
33 |
"^(Eksliibris Nikolai Issakovile).*$", |
|
34 |
"^(WHP Cruise Summary Information of section).*$", |
|
35 |
"^(Measurement of the top quark\\-pair production cross section with ATLAS in pp collisions at).*$", |
|
36 |
"^(Measurement of the spin\\-dependent structure function).*" |
|
37 |
]}}</PACE> |
|
38 |
<WORKFLOW>dedup.conf { |
|
39 |
threshold = 0.99, |
|
40 |
run = '001', |
|
41 |
entity.type = result, |
|
42 |
order.field = title, |
|
43 |
queue.max.size = 2000, |
|
44 |
group.max.size = 10, |
|
45 |
sliding.window.size = 200, |
|
46 |
rootbuilder = [result,personResult_authorship_hasAuthor,resultProject_outcome_isProducedBy,resultResult_publicationDataset_isRelatedTo,resultResult_similarity_isAmongTopNSimilarDocuments,resultResult_similarity_hasAmongTopNSimilarDocuments] }</WORKFLOW> |
|
47 |
</ENTITY> |
|
48 |
<ENTITY name="person"> |
|
49 |
<PACE>pace.conf { |
|
50 |
clustering { |
|
51 |
ngrampairs { fields = [fullname], params = { max = 1, ngramLen = 3} }, |
|
52 |
suffixprefix { fields = [fullname], params = { max = 1, len = 3 } } }, |
|
53 |
model { |
|
54 |
fullname { algo = JaroWinkler, type = String, weight = 0.6, ignoreMissing = false, path = person/metadata/fullname/value }, |
|
55 |
coauthors { algo = JaroWinkler, type = String, weight = 0.4, ignoreMissing = true, path = person/metadata/coauthors/value } } }</PACE> |
|
56 |
<WORKFLOW>dedup.conf { |
|
57 |
threshold = 0.99, |
|
58 |
run = '001', |
|
59 |
entity.type = person, |
|
60 |
queue.max.size = 2000, |
|
61 |
group.max.size = 10, |
|
62 |
sliding.window.size = 200, |
|
63 |
order.field = fullname, rootbuilder = [person,personResult_authorship_isAuthorOf,projectPerson_contactPerson_isContact] }</WORKFLOW> |
|
64 |
</ENTITY> |
|
65 |
<ENTITY name="organization"> |
|
66 |
<PACE>pace.conf { |
|
67 |
clustering { |
|
68 |
ngrampairs { fields = [legalname], params = { max = 1, ngramLen = 3} }, |
|
69 |
suffixprefix { fields = [legalname], params = { max = 1, len = 3 } } }, |
|
70 |
model { |
|
71 |
legalname { algo = JaroWinkler, type = String, weight = 0.6, ignoreMissing = false, path = organization/metadata/legalname/value }, |
|
72 |
legalshortname { algo = JaroWinkler, type = String, weight = 0.4, ignoreMissing = true, path = organization/metadata/legalshortname/value } } }</PACE> |
|
73 |
<WORKFLOW>dedup.conf { |
|
74 |
threshold = 0.99, |
|
75 |
run = '001', |
|
76 |
entity.type = organization, |
|
77 |
order.field = legalname, |
|
78 |
queue.max.size = 2000, |
|
79 |
group.max.size = 10, |
|
80 |
sliding.window.size = 200, |
|
81 |
rootbuilder = [organization,projectOrganization_participation_isParticipant,datasourceOrganization_provision_isProvidedBy] }</WORKFLOW> |
|
82 |
</ENTITY> |
|
83 |
</DEDUPLICATION> |
|
84 |
</CONFIGURATION> |
|
85 |
<STATUS> |
|
86 |
<LAST_UPDATE value="2001-12-31T12:00:00"/> |
|
87 |
</STATUS> |
|
88 |
<SECURITY_PARAMETERS>SECURITY_PARAMETERS</SECURITY_PARAMETERS> |
|
89 |
</BODY> |
|
90 |
</RESOURCE_PROFILE> |
modules/dnet-openaireplus-profiles/trunk/src/main/resources/eu/dnetlib/test/profiles/HadoopJobConfigurationDSResources/HadoopJobConfigurationDSResourceType/offlineHbaseLoadJob.xml | ||
---|---|---|
1 |
<RESOURCE_PROFILE> |
|
2 |
<HEADER> |
|
3 |
<RESOURCE_IDENTIFIER value="073e55eb-c6f4-49a9-80b3-1a927612ba5b_SGFkb29wSm9iQ29uZmlndXJhdGlvbkRTUmVzb3VyY2VzL0hhZG9vcEpvYkNvbmZpZ3VyYXRpb25EU1Jlc291cmNlVHlwZQ=="/> |
|
4 |
<RESOURCE_TYPE value="HadoopJobConfigurationDSResourceType"/> |
|
5 |
<RESOURCE_KIND value="HadoopJobConfigurationDSResources"/> |
|
6 |
<RESOURCE_URI value=""/> |
|
7 |
<DATE_OF_CREATION value="2001-12-31T12:00:00"/> |
|
8 |
</HEADER> |
|
9 |
<BODY> |
|
10 |
<HADOOP_JOB name="offlineHbaseLoad" type="mapreduce"> |
|
11 |
<DESCRIPTION>map reduce job that loads a given entity type in the offline dedup table</DESCRIPTION> |
|
12 |
<STATIC_CONFIGURATION> |
|
13 |
|
|
14 |
<!-- I/O FORMAT --> |
|
15 |
<PROPERTY key="mapreduce.inputformat.class" value="org.apache.hadoop.hbase.mapreduce.TableInputFormat" /> |
|
16 |
<PROPERTY key="mapreduce.outputformat.class" value="org.apache.hadoop.hbase.mapreduce.TableOutputFormat" /> |
|
17 |
|
|
18 |
<!-- MAPPER --> |
|
19 |
<PROPERTY key="mapreduce.map.class" value="eu.dnetlib.data.mapreduce.hbase.dedup.OfflineHbaseLoadMapper" /> |
|
20 |
<PROPERTY key="mapred.mapoutput.key.class" value="org.apache.hadoop.io.Text" /> |
|
21 |
<PROPERTY key="mapred.mapoutput.value.class" value="org.apache.hadoop.hbase.io.ImmutableBytesWritable" /> |
|
22 |
|
|
23 |
<!-- MISC --> |
|
24 |
<PROPERTY key="mapred.compress.map.output" value="true" /> |
|
25 |
<PROPERTY key="mapred.reduce.tasks.speculative.execution" value="false" /> |
|
26 |
<PROPERTY key="mapred.map.tasks.speculative.execution" value="false" /> |
|
27 |
<PROPERTY key="mapreduce.map.speculative" value="false" /> |
|
28 |
<PROPERTY key="mapreduce.reduce.speculative" value="false" /> |
|
29 |
|
|
30 |
<PROPERTY key="mapred.reduce.tasks" value="0" /> |
|
31 |
<!-- <PROPERTY key="user.name" value="dnet" /> --> |
|
32 |
|
|
33 |
<!-- Uncomment to override the default lib path --> |
|
34 |
<!-- <PROPERTY key="job.lib" value="/user/dnet/dnet-mapreduce-jobs-0.0.2-SNAPSHOT-jar-with-dependencies.jar"/> --> |
|
35 |
</STATIC_CONFIGURATION> |
|
36 |
<JOB_INTERFACE> |
|
37 |
<PARAM name="hbase.mapred.inputtable" required="true" description="source hbase table" /> |
|
38 |
<PARAM name="hbase.mapred.outputtable" required="true" description="target hbase table" /> |
|
39 |
<PARAM name="hbase.mapreduce.inputtable" required="true" description="source hbase table" /> |
|
40 |
<PARAM name="hbase.mapreduce.outputtable" required="true" description="target hbase table" /> |
|
41 |
</JOB_INTERFACE> |
|
42 |
<SCAN> |
|
43 |
<FILTERS operator="MUST_PASS_ALL"> |
|
44 |
<FILTER type="prefix" param="entityTypeId" /> |
|
45 |
</FILTERS> |
|
46 |
<FAMILIES> |
|
47 |
<FAMILY param="entityType" /> |
|
48 |
</FAMILIES> |
|
49 |
</SCAN> |
|
50 |
</HADOOP_JOB> |
|
51 |
<STATUS> |
|
52 |
<LAST_SUBMISSION_DATE value="2001-12-31T12:00:00"/> |
|
53 |
<RUNNING_INSTANCES value="0"/> |
|
54 |
<CUMULATIVE_RUN value="0" /> |
|
55 |
</STATUS> |
|
56 |
<SECURITY_PARAMETERS>SECURITY_PARAMETERS</SECURITY_PARAMETERS> |
|
57 |
</BODY> |
|
58 |
</RESOURCE_PROFILE> |
modules/dnet-openaireplus-profiles/trunk/src/main/resources/eu/dnetlib/test/profiles/HadoopJobConfigurationDSResources/HadoopJobConfigurationDSResourceType/dedupCandidateScanJob.xml | ||
---|---|---|
43 | 43 |
<PARAM name="hbase.mapred.outputtable" required="true" description="target hbase table" /> |
44 | 44 |
<PARAM name="hbase.mapreduce.inputtable" required="true" description="source hbase table" /> |
45 | 45 |
<PARAM name="hbase.mapreduce.outputtable" required="true" description="target hbase table" /> |
46 |
<PARAM name="dedup.pace.conf" required="true" description="dedup pace configuration" /> |
|
47 |
<PARAM name="dedup.wf.conf" required="true" description="dedup workflow configuration" /> |
|
46 |
<PARAM name="dedup.conf" required="true" description="dedup configuration" /> |
|
48 | 47 |
</JOB_INTERFACE> |
49 | 48 |
<SCAN> |
50 | 49 |
<FILTERS operator="MUST_PASS_ALL"> |
modules/dnet-openaireplus-profiles/trunk/src/main/resources/eu/dnetlib/test/profiles/HadoopJobConfigurationDSResources/HadoopJobConfigurationDSResourceType/dedupRootsToCSVJob.xml | ||
---|---|---|
1 |
<RESOURCE_PROFILE> |
|
2 |
<HEADER> |
|
3 |
<RESOURCE_IDENTIFIER value="027554bd-3d5c-4c50-9170-90d8c4402bc3_SGFkb29wSm9iQ29uZmlndXJhdGlvbkRTUmVzb3VyY2VzL0hhZG9vcEpvYkNvbmZpZ3VyYXRpb25EU1Jlc291cmNlVHlwZQ=="/> |
|
4 |
<RESOURCE_TYPE value="HadoopJobConfigurationDSResourceType"/> |
|
5 |
<RESOURCE_KIND value="HadoopJobConfigurationDSResources"/> |
|
6 |
<RESOURCE_URI value=""/> |
|
7 |
<DATE_OF_CREATION value="2001-12-31T12:00:00"/> |
|
8 |
</HEADER> |
|
9 |
<BODY> |
|
10 |
<HADOOP_JOB name="dedupRootsToCSVJob" type="mapreduce"> |
|
11 |
<DESCRIPTION>map reduce job that exports the representatives as CSV files</DESCRIPTION> |
|
12 |
<STATIC_CONFIGURATION> |
|
13 |
|
|
14 |
<!-- I/O FORMAT --> |
|
15 |
<PROPERTY key="mapreduce.inputformat.class" value="org.apache.hadoop.hbase.mapreduce.TableInputFormat" /> |
|
16 |
<PROPERTY key="mapreduce.outputformat.class" value="org.apache.hadoop.mapreduce.lib.output.LazyOutputFormat" /> |
|
17 |
<PROPERTY key="mapreduce.output.lazyoutputformat.outputformat" value="org.apache.hadoop.mapreduce.lib.output.TextOutputFormat" /> |
|
18 |
|
|
19 |
<!-- MAPPER --> |
|
20 |
<PROPERTY key="mapreduce.map.class" value="eu.dnetlib.data.mapreduce.hbase.dedup.DedupRootsToCsvMapper" /> |
|
21 |
<PROPERTY key="mapred.mapoutput.key.class" value="org.apache.hadoop.hbase.io.ImmutableBytesWritable" /> |
|
22 |
<PROPERTY key="mapred.mapoutput.value.class" value="org.apache.hadoop.hbase.io.ImmutableBytesWritable" /> |
|
23 |
|
|
24 |
<!-- REDUCER --> |
|
25 |
<PROPERTY key="mapreduce.reduce.class" value="eu.dnetlib.data.mapreduce.hbase.dedup.DedupRootsToCsvReducer" /> |
|
26 |
<PROPERTY key="mapred.output.key.class" value="org.apache.hadoop.io.Text" /> |
|
27 |
<PROPERTY key="mapred.output.value.class" value="org.apache.hadoop.io.Text" /> |
|
28 |
|
|
29 |
<!-- MULTIPLE OUTPUT --> |
|
30 |
<PROPERTY key="mapreduce.multipleoutputs" value="NativeGroups Groups NativeEntities" /> |
|
31 |
|
|
32 |
<PROPERTY key="mapreduce.multipleoutputs.namedOutput.NativeGroups.format" value="org.apache.hadoop.mapreduce.lib.output.TextOutputFormat" /> |
|
33 |
<PROPERTY key="mapreduce.multipleoutputs.namedOutput.NativeGroups.key" value="org.apache.hadoop.io.Text" /> |
|
34 |
<PROPERTY key="mapreduce.multipleoutputs.namedOutput.NativeGroups.value" value="org.apache.hadoop.io.Text" /> |
|
35 |
|
|
36 |
<PROPERTY key="mapreduce.multipleoutputs.namedOutput.Groups.format" value="org.apache.hadoop.mapreduce.lib.output.TextOutputFormat" /> |
|
37 |
<PROPERTY key="mapreduce.multipleoutputs.namedOutput.Groups.key" value="org.apache.hadoop.io.Text" /> |
|
38 |
<PROPERTY key="mapreduce.multipleoutputs.namedOutput.Groups.value" value="org.apache.hadoop.io.Text" /> |
|
39 |
|
|
40 |
<PROPERTY key="mapreduce.multipleoutputs.namedOutput.NativeEntities.format" value="org.apache.hadoop.mapreduce.lib.output.TextOutputFormat" /> |
|
41 |
<PROPERTY key="mapreduce.multipleoutputs.namedOutput.NativeEntities.key" value="org.apache.hadoop.io.Text" /> |
|
42 |
<PROPERTY key="mapreduce.multipleoutputs.namedOutput.NativeEntities.value" value="org.apache.hadoop.io.Text" /> |
|
43 |
|
|
44 |
<!-- MISC --> |
|
45 |
|
|
46 |
<PROPERTY key="mapred.textoutputformat.wrapper" value="#"/> |
|
47 |
<PROPERTY key="mapred.textoutputformat.separator" value="!"/> |
|
48 |
|
|
49 |
<PROPERTY key="mapred.reduce.tasks.speculative.execution" value="false" /> |
|
50 |
<PROPERTY key="mapred.map.tasks.speculative.execution" value="false" /> |
|
51 |
<PROPERTY key="mapreduce.map.speculative" value="false" /> |
|
52 |
<PROPERTY key="mapreduce.reduce.speculative" value="false" /> |
|
53 |
|
|
54 |
<PROPERTY key="mapred.reduce.tasks" value="3" /> |
|
55 |
<!-- <PROPERTY key="user.name" value="dnet" /> --> |
|
56 |
|
|
57 |
<!-- Uncomment to override the default lib path --> |
|
58 |
<!-- <PROPERTY key="job.lib" value="/user/dnet/dnet-mapreduce-jobs-0.0.2-SNAPSHOT-jar-with-dependencies.jar"/> --> |
|
59 |
</STATIC_CONFIGURATION> |
|
60 |
<JOB_INTERFACE> |
|
61 |
<PARAM name="hbase.mapred.inputtable" required="true" description="source hbase table" /> |
|
62 |
<PARAM name="hbase.mapreduce.inputtable" required="true" description="source hbase table" /> |
|
63 |
|
|
64 |
<PARAM name="mapred.output.dir" required="true" description="target directory on hdfs for the CSV files" /> |
|
65 |
</JOB_INTERFACE> |
|
66 |
<SCAN> |
|
67 |
<FILTERS operator="MUST_PASS_ALL"> |
|
68 |
<FILTER type="prefix" param="entityTypeId" /> |
|
69 |
</FILTERS> |
|
70 |
<FAMILIES> |
|
71 |
<FAMILY param="entityType" /> |
|
72 |
<FAMILY value="resultResult_dedup_merges" /> |
|
73 |
<FAMILY value="personPerson_dedup_merges" /> |
|
74 |
<FAMILY value="organizationOrganization_dedup_merges" /> |
|
75 |
</FAMILIES> |
|
76 |
</SCAN> |
|
77 |
</HADOOP_JOB> |
|
78 |
<STATUS> |
|
79 |
<LAST_SUBMISSION_DATE value="2001-12-31T12:00:00"/> |
|
80 |
<RUNNING_INSTANCES value="0"/> |
|
81 |
<CUMULATIVE_RUN value="0" /> |
|
82 |
</STATUS> |
|
83 |
<SECURITY_PARAMETERS>SECURITY_PARAMETERS</SECURITY_PARAMETERS> |
|
84 |
</BODY> |
|
85 |
</RESOURCE_PROFILE> |
modules/dnet-openaireplus-profiles/trunk/src/main/resources/eu/dnetlib/test/profiles/HadoopJobConfigurationDSResources/HadoopJobConfigurationDSResourceType/dedupBuildRootsJob.xml | ||
---|---|---|
14 | 14 |
<!-- I/O FORMAT --> |
15 | 15 |
<PROPERTY key="mapreduce.inputformat.class" value="org.apache.hadoop.hbase.mapreduce.TableInputFormat" /> |
16 | 16 |
<PROPERTY key="mapreduce.outputformat.class" value="org.apache.hadoop.hbase.mapreduce.TableOutputFormat" /> |
17 |
|
|
17 |
|
|
18 | 18 |
<!-- MAPPER --> |
19 | 19 |
<PROPERTY key="mapreduce.map.class" value="eu.dnetlib.data.mapreduce.hbase.dedup.DedupBuildRootsMapper" /> |
20 | 20 |
<PROPERTY key="mapred.mapoutput.key.class" value="org.apache.hadoop.io.Text" /> |
21 | 21 |
<PROPERTY key="mapred.mapoutput.value.class" value="org.apache.hadoop.hbase.io.ImmutableBytesWritable" /> |
22 |
|
|
22 |
|
|
23 | 23 |
<!-- REDUCER --> |
24 | 24 |
<PROPERTY key="mapreduce.reduce.class" value="eu.dnetlib.data.mapreduce.hbase.dedup.DedupBuildRootsReducer" /> |
25 | 25 |
<PROPERTY key="mapred.output.key.class" value="org.apache.hadoop.hbase.io.ImmutableBytesWritable" /> |
26 | 26 |
<PROPERTY key="mapred.output.value.class" value="org.apache.hadoop.io.Writable" /> |
27 |
|
|
27 |
|
|
28 | 28 |
<!-- MISC --> |
29 | 29 |
<PROPERTY key="mapred.reduce.tasks.speculative.execution" value="false" /> |
30 | 30 |
<PROPERTY key="mapred.map.tasks.speculative.execution" value="false" /> |
modules/dnet-openaireplus-profiles/trunk/src/main/resources/eu/dnetlib/test/profiles/HadoopJobConfigurationDSResources/HadoopJobConfigurationDSResourceType/dedupSimilarity2ActionsJob.xml | ||
---|---|---|
1 |
<RESOURCE_PROFILE> |
|
2 |
<HEADER> |
|
3 |
<RESOURCE_IDENTIFIER value="5c4b4dbf-8198-4f7a-9a35-367c7b0a7391_SGFkb29wSm9iQ29uZmlndXJhdGlvbkRTUmVzb3VyY2VzL0hhZG9vcEpvYkNvbmZpZ3VyYXRpb25EU1Jlc291cmNlVHlwZQ=="/> |
|
4 |
<RESOURCE_TYPE value="HadoopJobConfigurationDSResourceType"/> |
|
5 |
<RESOURCE_KIND value="HadoopJobConfigurationDSResources"/> |
|
6 |
<RESOURCE_URI value=""/> |
|
7 |
<DATE_OF_CREATION value="2001-12-31T12:00:00"/> |
|
8 |
</HEADER> |
|
9 |
<BODY> |
|
10 |
<HADOOP_JOB name="dedupSimilarity2ActionsJob" type="mapreduce"> |
|
11 |
<DESCRIPTION>map reduce job that scans a given entity type and creates the similarRel graph</DESCRIPTION> |
|
12 |
<STATIC_CONFIGURATION> |
|
13 |
|
|
14 |
<!-- I/O FORMAT --> |
|
15 |
<PROPERTY key="mapreduce.inputformat.class" value="org.apache.hadoop.hbase.mapreduce.TableInputFormat" /> |
|
16 |
<PROPERTY key="mapreduce.outputformat.class" value="org.apache.hadoop.hbase.mapreduce.TableOutputFormat" /> |
|
17 |
|
|
18 |
<!-- MAPPER --> |
|
19 |
<PROPERTY key="mapreduce.map.class" value="eu.dnetlib.data.mapreduce.hbase.dedup.DedupSimilarityToActionsMapper" /> |
|
20 |
<PROPERTY key="mapred.mapoutput.key.class" value="org.apache.hadoop.io.Text" /> |
|
21 |
<PROPERTY key="mapred.mapoutput.value.class" value="org.apache.hadoop.hbase.io.ImmutableBytesWritable" /> |
|
22 |
|
|
23 |
<!-- MISC --> |
|
24 |
<PROPERTY key="mapred.compress.map.output" value="true" /> |
|
25 |
<PROPERTY key="mapred.reduce.tasks.speculative.execution" value="false" /> |
|
26 |
<PROPERTY key="mapred.map.tasks.speculative.execution" value="false" /> |
|
27 |
<PROPERTY key="mapreduce.map.speculative" value="false" /> |
|
28 |
<PROPERTY key="mapreduce.reduce.speculative" value="false" /> |
|
29 |
|
|
30 |
<PROPERTY key="mapred.reduce.tasks" value="0" /> |
|
31 |
<!-- <PROPERTY key="user.name" value="dnet" /> --> |
|
32 |
|
|
33 |
<!-- Uncomment to override the default lib path --> |
|
34 |
<!-- <PROPERTY key="job.lib" value="/user/dnet/dnet-mapreduce-jobs-0.0.2-SNAPSHOT-jar-with-dependencies.jar"/> --> |
|
35 |
</STATIC_CONFIGURATION> |
|
36 |
<JOB_INTERFACE> |
|
37 |
<PARAM name="hbase.mapred.inputtable" required="true" description="source hbase table" /> |
|
38 |
<PARAM name="hbase.mapred.outputtable" required="true" description="target hbase table" /> |
|
39 |
<PARAM name="hbase.mapreduce.inputtable" required="true" description="source hbase table" /> |
|
40 |
<PARAM name="hbase.mapreduce.outputtable" required="true" description="target hbase table" /> |
|
41 |
<PARAM name="dedup.conf" required="true" description="dedup configuration" /> |
|
42 |
<PARAM name="rawSetId" required="true" description="raw set identifier" /> |
|
43 |
<PARAM name="similarityCF" required="true" description="similarity column family name" /> |
|
44 |
</JOB_INTERFACE> |
|
45 |
<SCAN> |
|
46 |
<FILTERS operator="MUST_PASS_ALL"> |
|
47 |
<FILTER type="prefix" param="entityTypeId" /> |
|
48 |
</FILTERS> |
|
49 |
<FAMILIES> |
|
50 |
<FAMILY param="entityType" /> |
|
51 |
<FAMILY value="resultResult_dedupSimilarity_isSimilarTo" /> |
|
52 |
<FAMILY value="personPerson_dedupSimilarity_isSimilarTo" /> |
|
53 |
<FAMILY value="organizationOrganization_dedupSimilarity_isSimilarTo" /> |
|
54 |
</FAMILIES> |
|
55 |
</SCAN> |
|
56 |
</HADOOP_JOB> |
|
57 |
<STATUS> |
|
58 |
<LAST_SUBMISSION_DATE value="2001-12-31T12:00:00"/> |
|
59 |
<RUNNING_INSTANCES value="0"/> |
|
60 |
<CUMULATIVE_RUN value="0" /> |
|
61 |
</STATUS> |
|
62 |
<SECURITY_PARAMETERS>SECURITY_PARAMETERS</SECURITY_PARAMETERS> |
|
63 |
</BODY> |
|
64 |
</RESOURCE_PROFILE> |
modules/dnet-openaireplus-profiles/trunk/src/main/resources/eu/dnetlib/test/profiles/HadoopJobConfigurationDSResources/HadoopJobConfigurationDSResourceType/promoteActions.xml | ||
---|---|---|
1 |
<RESOURCE_PROFILE> |
|
2 |
<HEADER> |
|
3 |
<RESOURCE_IDENTIFIER |
|
4 |
value="8bb6c559-edf3-4da1-87d7-cdee4fba21dd_SGFkb29wSm9iQ29uZmlndXJhdGlvbkRTUmVzb3VyY2VzL0hhZG9vcEpvYkNvbmZpZ3VyYXRpb25EU1Jlc291cmNlVHlwZQ==" /> |
|
5 |
<RESOURCE_TYPE value="HadoopJobConfigurationDSResourceType" /> |
|
6 |
<RESOURCE_KIND value="HadoopJobConfigurationDSResources" /> |
|
7 |
<RESOURCE_URI value="" /> |
|
8 |
<DATE_OF_CREATION value="2001-12-31T12:00:00" /> |
|
9 |
</HEADER> |
|
10 |
<BODY> |
|
11 |
<HADOOP_JOB name="promoteActionsJob" type="mapreduce"> |
|
12 |
<DESCRIPTION>map only job that promotes actions from a set to the data table</DESCRIPTION> |
|
13 |
<STATIC_CONFIGURATION> |
|
14 |
<PROPERTY key="mapreduce.map.class" value="eu.dnetlib.data.mapreduce.hbase.actions2.PromoteActionsMapper" /> |
|
15 |
<PROPERTY key="mapreduce.inputformat.class" value="org.apache.hadoop.hbase.mapreduce.TableInputFormat" /> |
|
16 |
<PROPERTY key="mapreduce.outputformat.class" value="org.apache.hadoop.hbase.mapreduce.TableOutputFormat" /> |
|
17 |
<PROPERTY key="mapred.mapoutput.key.class" value="org.apache.hadoop.io.Text" /> |
|
18 |
<PROPERTY key="mapred.mapoutput.value.class" value="org.apache.hadoop.hbase.client.Put" /> |
|
19 |
<PROPERTY key="mapred.map.tasks.speculative.execution" value="false" /> |
|
20 |
<PROPERTY key="mapreduce.map.speculative" value="false" /> |
|
21 |
<PROPERTY key="mapred.reduce.tasks" value="0" /> |
|
22 |
<!-- <PROPERTY key="job.lib" value="/tmp/dnet-mapreduce-jobs-0.0.3-SNAPSHOT-jar-with-dependencies.jar" /> --> |
|
23 |
</STATIC_CONFIGURATION> |
|
24 |
|
|
25 |
<JOB_INTERFACE> |
|
26 |
<PARAM name="hbase.mapred.inputtable" required="true" description="source hbase table" /> |
|
27 |
<PARAM name="hbase.mapred.outputtable" required="true" description="target hbase table" /> |
|
28 |
<PARAM name="hbase.mapreduce.inputtable" required="true" description="source hbase table" /> |
|
29 |
<PARAM name="hbase.mapreduce.outputtable" required="true" description="target hbase table" /> |
|
30 |
<PARAM name="latestRawSets" required="true" description="the latest action raw sets" /> |
|
31 |
</JOB_INTERFACE> |
|
32 |
|
|
33 |
<SCAN> |
|
34 |
<FILTERS operator="MUST_PASS_ALL"> |
|
35 |
<FILTER type="prefix" value="aac|" /> |
|
36 |
</FILTERS> |
|
37 |
<FAMILIES /> |
|
38 |
</SCAN> |
|
39 |
|
|
40 |
</HADOOP_JOB> |
|
41 |
|
|
42 |
<STATUS> |
|
43 |
<LAST_SUBMISSION_DATE value="2001-12-31T12:00:00" /> |
|
44 |
<RUNNING_INSTANCES value="0" /> |
|
45 |
<CUMULATIVE_RUN value="0" /> |
|
46 |
</STATUS> |
|
47 |
|
|
48 |
<SECURITY_PARAMETERS>SECURITY_PARAMETERS</SECURITY_PARAMETERS> |
|
49 |
</BODY> |
|
50 |
</RESOURCE_PROFILE> |
modules/dnet-openaireplus-profiles/trunk/src/main/resources/eu/dnetlib/test/profiles/HadoopJobConfigurationDSResources/HadoopJobConfigurationDSResourceType/dedupIndexFeedJob.xml | ||
---|---|---|
1 |
<RESOURCE_PROFILE> |
|
2 |
<HEADER> |
|
3 |
<RESOURCE_IDENTIFIER value="fa0e188c-aec4-4877-93b4-43e3c5acae1d_SGFkb29wSm9iQ29uZmlndXJhdGlvbkRTUmVzb3VyY2VzL0hhZG9vcEpvYkNvbmZpZ3VyYXRpb25EU1Jlc291cmNlVHlwZQ=="/> |
|
4 |
<RESOURCE_TYPE value="HadoopJobConfigurationDSResourceType"/> |
|
5 |
<RESOURCE_KIND value="HadoopJobConfigurationDSResources"/> |
|
6 |
<RESOURCE_URI value=""/> |
|
7 |
<DATE_OF_CREATION value="2001-12-31T12:00:00"/> |
|
8 |
</HEADER> |
|
9 |
<BODY> |
|
10 |
<HADOOP_JOB name="dedupIndexFeedJob" type="mapreduce"> |
|
11 |
<DESCRIPTION>map reduce job that updates the dedup index</DESCRIPTION> |
|
12 |
<STATIC_CONFIGURATION> |
|
13 |
|
|
14 |
<!-- I/O FORMAT --> |
|
15 |
<PROPERTY key="mapreduce.inputformat.class" value="org.apache.hadoop.hbase.mapreduce.TableInputFormat" /> |
|
16 |
<PROPERTY key="mapreduce.outputformat.class" value="org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat" /> |
|
17 |
|
|
18 |
<!-- MAPPER --> |
|
19 |
<PROPERTY key="mapreduce.map.class" value="eu.dnetlib.data.mapreduce.hbase.index.DedupIndexFeedMapper" /> |
|
20 |
<PROPERTY key="mapred.mapoutput.key.class" value="org.apache.hadoop.io.Text" /> |
|
21 |
<PROPERTY key="mapred.mapoutput.value.class" value="org.apache.hadoop.io.Text" /> |
|
22 |
|
|
23 |
<!-- JOB GLOBAL --> |
|
24 |
<PROPERTY key="mapred.output.key.class" value="org.apache.hadoop.io.Text" /> |
|
25 |
<PROPERTY key="mapred.output.value.class" value="org.apache.hadoop.io.Text"/> |
|
26 |
|
|
27 |
<!-- MISC --> |
|
28 |
<PROPERTY key="mapred.map.tasks.speculative.execution" value="false" /> |
|
29 |
<PROPERTY key="mapreduce.map.speculative" value="false" /> |
|
30 |
<PROPERTY key="mapred.reduce.tasks" value="0" /> |
|
31 |
<PROPERTY key="mapred.fairscheduler.pool" value="solr"/> |
|
32 |
<!-- <PROPERTY key="user.name" value="dnet" /> --> |
|
33 |
|
|
34 |
<!-- Uncomment to override the default lib path --> |
|
35 |
<!-- <PROPERTY key="job.lib" value="/user/dnet/dnet-mapreduce-jobs-0.0.2-SNAPSHOT-jar-with-dependencies.jar"/> --> |
|
36 |
</STATIC_CONFIGURATION> |
|
37 |
<JOB_INTERFACE> |
|
38 |
<PARAM name="hbase.mapred.inputtable" required="true" description="source hbase table" /> |
|
39 |
<PARAM name="hbase.mapreduce.inputtable" required="true" description="source hbase table" /> |
|
40 |
<PARAM name="mapred.output.dir" required="true" description="destination path on hdfs for rotten index xml records" /> |
|
41 |
|
|
42 |
<PARAM name="index.solr.url" required="false" description="url used to instantiate the solr client" /> |
|
43 |
<PARAM name="index.solr.collection" required="true" description="target solr collection to be fed" /> |
|
44 |
|
|
45 |
<PARAM name="id" required="true" description="index DS id" /> |
|
46 |
<PARAM name="index.shutdown.wait.time" required="true" description="wait time before shut down the solr client pool" /> |
|
47 |
<PARAM name="index.buffer.flush.threshold" required="true" description="indexing buffer flush threshold" /> |
|
48 |
<PARAM name="index.feed.timestamp" required="true" description="timestamp used as ds_version" /> |
|
49 |
<PARAM name="index.solr.sim.mode" required="true" description="boolean value, allows to run this job in simulation mode" /> |
|
50 |
<PARAM name="index.fields" required="true" description="fields from a given MDFormatDSResourceType" /> |
|
51 |
<PARAM name="entityType" required="true" description="entity Type name" /> |
|
52 |
</JOB_INTERFACE> |
|
53 |
<SCAN> |
|
54 |
<FILTERS operator="MUST_PASS_ALL"> |
|
55 |
<FILTER type="prefix" param="entityTypeId" /> |
|
56 |
</FILTERS> |
|
57 |
<FAMILIES> |
|
58 |
<FAMILY param="entityType" /> |
|
59 |
</FAMILIES> |
|
60 |
</SCAN> |
|
61 |
</HADOOP_JOB> |
|
62 |
<STATUS> |
|
63 |
<LAST_SUBMISSION_DATE value="2001-12-31T12:00:00"/> |
|
64 |
<RUNNING_INSTANCES value="0"/> |
|
65 |
<CUMULATIVE_RUN value="0" /> |
|
66 |
</STATUS> |
|
67 |
<SECURITY_PARAMETERS>SECURITY_PARAMETERS</SECURITY_PARAMETERS> |
|
68 |
</BODY> |
|
69 |
</RESOURCE_PROFILE> |
modules/dnet-openaireplus-profiles/trunk/src/main/resources/eu/dnetlib/test/profiles/DedupOrchestrationDSResources/DedupOrchestrationDSResourceType/organization.xml | ||
---|---|---|
1 |
<RESOURCE_PROFILE> |
|
2 |
<HEADER> |
|
3 |
<RESOURCE_IDENTIFIER value="4b9a58d4-a048-4161-b5b1-a5cdf78a9956_RGVkdXBPcmNoZXN0cmF0aW9uRFNSZXNvdXJjZXMvRGVkdXBPcmNoZXN0cmF0aW9uRFNSZXNvdXJjZVR5cGU="/> |
|
4 |
<RESOURCE_TYPE value="DedupOrchestrationDSResourceType"/> |
|
5 |
<RESOURCE_KIND value="DedupOrchestrationDSResources"/> |
|
6 |
<RESOURCE_URI value=""/> |
|
7 |
<DATE_OF_CREATION value="2001-12-31T12:00:00"/> |
|
8 |
</HEADER> |
|
9 |
<BODY> |
|
10 |
<CONFIGURATION> |
|
11 |
<DEDUPLICATION> |
|
12 |
<ENTITY name="organization" code="20" label="Organization" /> |
|
13 |
<ACTION_SET id="dedup-similarity-organization" /> |
|
14 |
<SCAN_SEQUENCE> |
|
15 |
<SCAN id="98494a63-f5d1-46f7-9afd-e026c1dda913_RGVkdXBDb25maWd1cmF0aW9uRFNSZXNvdXJjZXMvRGVkdXBDb25maWd1cmF0aW9uRFNSZXNvdXJjZVR5cGU="/> |
|
16 |
<SCAN id="1d52dba7-1902-4c25-bf5b-3598f29ef11c_RGVkdXBDb25maWd1cmF0aW9uRFNSZXNvdXJjZXMvRGVkdXBDb25maWd1cmF0aW9uRFNSZXNvdXJjZVR5cGU="/> |
|
17 |
</SCAN_SEQUENCE> |
|
18 |
</DEDUPLICATION> |
|
19 |
</CONFIGURATION> |
|
20 |
<STATUS> |
|
21 |
<LAST_UPDATE value="2001-12-31T12:00:00"/> |
|
22 |
</STATUS> |
|
23 |
<SECURITY_PARAMETERS>SECURITY_PARAMETERS</SECURITY_PARAMETERS> |
|
24 |
</BODY> |
|
25 |
</RESOURCE_PROFILE> |
modules/dnet-openaireplus-profiles/trunk/src/main/resources/eu/dnetlib/test/profiles/DedupOrchestrationDSResources/DedupOrchestrationDSResourceType/result.xml | ||
---|---|---|
1 |
<RESOURCE_PROFILE> |
|
2 |
<HEADER> |
|
3 |
<RESOURCE_IDENTIFIER value="acaf1433-8a35-4708-b903-ab35c899851d_RGVkdXBPcmNoZXN0cmF0aW9uRFNSZXNvdXJjZXMvRGVkdXBPcmNoZXN0cmF0aW9uRFNSZXNvdXJjZVR5cGU="/> |
|
4 |
<RESOURCE_TYPE value="DedupOrchestrationDSResourceType"/> |
|
5 |
<RESOURCE_KIND value="DedupOrchestrationDSResources"/> |
|
6 |
<RESOURCE_URI value=""/> |
|
7 |
<DATE_OF_CREATION value="2001-12-31T12:00:00"/> |
|
8 |
</HEADER> |
|
9 |
<BODY> |
|
10 |
<CONFIGURATION> |
|
11 |
<DEDUPLICATION> |
|
12 |
<ENTITY name="result" code="50" label="Publication" /> |
|
13 |
<ACTION_SET id="dedup-similarity-result" /> |
|
14 |
<SCAN_SEQUENCE> |
|
15 |
<SCAN id="c611ec67-eefc-4ffe-a5d4-cb3fc40a8baf_RGVkdXBDb25maWd1cmF0aW9uRFNSZXNvdXJjZXMvRGVkdXBDb25maWd1cmF0aW9uRFNSZXNvdXJjZVR5cGU="/> |
|
16 |
</SCAN_SEQUENCE> |
|
17 |
</DEDUPLICATION> |
|
18 |
</CONFIGURATION> |
|
19 |
<STATUS> |
|
20 |
<LAST_UPDATE value="2001-12-31T12:00:00"/> |
|
21 |
</STATUS> |
|
22 |
<SECURITY_PARAMETERS>SECURITY_PARAMETERS</SECURITY_PARAMETERS> |
|
23 |
</BODY> |
|
24 |
</RESOURCE_PROFILE> |
modules/dnet-openaireplus-profiles/trunk/src/main/resources/eu/dnetlib/test/profiles/DedupConfigurationDSResources/DedupConfigurationDSResourceType/result.step.01.xml | ||
---|---|---|
1 |
<RESOURCE_PROFILE> |
|
2 |
<HEADER> |
|
3 |
<RESOURCE_IDENTIFIER value="c611ec67-eefc-4ffe-a5d4-cb3fc40a8baf_RGVkdXBDb25maWd1cmF0aW9uRFNSZXNvdXJjZXMvRGVkdXBDb25maWd1cmF0aW9uRFNSZXNvdXJjZVR5cGU="/> |
|
4 |
<RESOURCE_TYPE value="DedupConfigurationDSResourceType"/> |
|
5 |
<RESOURCE_KIND value="DedupConfigurationDSResources"/> |
|
6 |
<RESOURCE_URI value=""/> |
|
7 |
<DATE_OF_CREATION value="2001-12-31T12:00:00"/> |
|
8 |
</HEADER> |
|
9 |
<BODY> |
|
10 |
<CONFIGURATION id="01"> |
|
11 |
<DEDUPLICATION> |
|
12 |
{ |
|
13 |
"wf" : { |
|
14 |
"threshold" : "0.99", |
|
15 |
"dedupRun" : "001", |
|
16 |
"entityType" : "result", |
|
17 |
"orderField" : "title", |
|
18 |
"queueMaxSize" : "2000", |
|
19 |
"groupMaxSize" : "10", |
|
20 |
"slidingWindowSize" : "200", |
|
21 |
"rootBuilder" : [ "result", "personResult_authorship_hasAuthor", "resultProject_outcome_isProducedBy", "resultResult_publicationDataset_isRelatedTo", "resultResult_similarity_isAmongTopNSimilarDocuments", "resultResult_similarity_hasAmongTopNSimilarDocuments" ], |
|
22 |
"includeChildren" : "true" |
|
23 |
}, |
|
24 |
"pace" : { |
|
25 |
"clustering" : [ |
|
26 |
{ "name" : "ngrampairs", "fields" : [ "title" ], "params" : { "max" : "1", "ngramLen" : "3"} }, |
|
27 |
{ "name" : "suffixprefix", "fields" : [ "title" ], "params" : { "max" : "1", "len" : "3" } } |
|
28 |
], |
|
29 |
"conditions" : [ |
|
30 |
{ "name" : "titleVersionMatch", "fields" : [ "title" ] } |
|
31 |
], |
|
32 |
"model" : [ |
|
33 |
{ "name" : "title", "algo" : "JaroWinkler", "type" : "String", "weight" : "0.8", "ignoreMissing" : "false", "path" : "result/metadata/title/value" } |
|
34 |
], |
|
35 |
"blacklists" : { |
|
36 |
"title" : [ |
|
37 |
"^(Corpus Oral Dialectal \\(COD\\)\\.).*$", |
|
38 |
"^(Kiri Karl Morgensternile).*$", |
|
39 |
"^(\\[Eksliibris Aleksandr).*\\]$", |
|
40 |
"^(\\[Eksliibris Aleksandr).*$", |
|
41 |
"^(Eksliibris Aleksandr).*$", |
|
42 |
"^(Kiri A\\. de Vignolles).*$", |
|
43 |
"^(2 kirja Karl Morgensternile).*$", |
|
44 |
"^(Pirita kloostri idaosa arheoloogilised).*$", |
|
45 |
"^(Kiri tundmatule).*$", |
|
46 |
"^(Kiri Jenaer Allgemeine Literaturzeitung toimetusele).*$", |
|
47 |
"^(Eksliibris Nikolai Birukovile).*$", |
|
48 |
"^(Eksliibris Nikolai Issakovile).*$", |
|
49 |
"^(WHP Cruise Summary Information of section).*$", |
|
50 |
"^(Measurement of the top quark\\-pair production cross section with ATLAS in pp collisions at).*$", |
|
51 |
"^(Measurement of the spin\\-dependent structure function).*" |
|
52 |
] |
|
53 |
} |
|
54 |
} |
|
55 |
} |
|
56 |
</DEDUPLICATION> |
|
57 |
</CONFIGURATION> |
|
58 |
<STATUS> |
|
59 |
<LAST_UPDATE value="2001-12-31T12:00:00"/> |
|
60 |
</STATUS> |
|
61 |
<SECURITY_PARAMETERS>SECURITY_PARAMETERS</SECURITY_PARAMETERS> |
|
62 |
</BODY> |
|
63 |
</RESOURCE_PROFILE> |
modules/dnet-openaireplus-profiles/trunk/src/main/resources/eu/dnetlib/test/profiles/DedupConfigurationDSResources/DedupConfigurationDSResourceType/organization.step.01.xml | ||
---|---|---|
1 |
<RESOURCE_PROFILE> |
|
2 |
<HEADER> |
|
3 |
<RESOURCE_IDENTIFIER value="98494a63-f5d1-46f7-9afd-e026c1dda913_RGVkdXBDb25maWd1cmF0aW9uRFNSZXNvdXJjZXMvRGVkdXBDb25maWd1cmF0aW9uRFNSZXNvdXJjZVR5cGU="/> |
|
4 |
<RESOURCE_TYPE value="DedupConfigurationDSResourceType"/> |
|
5 |
<RESOURCE_KIND value="DedupConfigurationDSResources"/> |
|
6 |
<RESOURCE_URI value=""/> |
|
7 |
<DATE_OF_CREATION value="2001-12-31T12:00:00"/> |
|
8 |
</HEADER> |
|
9 |
<BODY> |
|
10 |
<CONFIGURATION> |
|
11 |
<DESCRIPTION>1 - Organization: Merge by legalshortname</DESCRIPTION> |
|
12 |
<DEDUPLICATION> |
|
13 |
{ |
|
14 |
"wf" : { |
|
15 |
"threshold" : "1.0", |
|
16 |
"dedupRun" : "001", |
|
17 |
"entityType" : "organization", |
|
18 |
"orderField" : "legalshortname", |
|
19 |
"queueMaxSize" : "2000", |
|
20 |
"groupMaxSize" : "10", |
|
21 |
"slidingWindowSize" : "200", |
|
22 |
"rootBuilder" : [ "organization", "projectOrganization_participation_isParticipant", "datasourceOrganization_provision_isProvidedBy" ], |
|
23 |
"includeChildren" : "true" |
|
24 |
}, |
|
25 |
"pace" : { |
|
26 |
"clustering" : [ |
|
27 |
{ "name" : "spacetrimmingfieldvalue", "fields" : [ "legalshortname" ], "params" : { "randomLength" : "5" } } |
|
28 |
], |
|
29 |
"model" : [ |
|
30 |
{ "name" : "legalshortname", "algo" : "JaroWinkler", "type" : "String", "weight" : "1.0", "ignoreMissing" : "true", "path" : "organization/metadata/legalshortname/value" } |
|
31 |
], |
|
32 |
"blacklists" : { } |
|
33 |
} |
|
34 |
} |
|
35 |
</DEDUPLICATION> |
|
36 |
</CONFIGURATION> |
|
37 |
<STATUS> |
|
38 |
<LAST_UPDATE value="2001-12-31T12:00:00"/> |
|
39 |
</STATUS> |
|
40 |
<SECURITY_PARAMETERS>SECURITY_PARAMETERS</SECURITY_PARAMETERS> |
|
41 |
</BODY> |
|
42 |
</RESOURCE_PROFILE> |
modules/dnet-openaireplus-profiles/trunk/src/main/resources/eu/dnetlib/test/profiles/DedupConfigurationDSResources/DedupConfigurationDSResourceType/organization.step.02.xml | ||
---|---|---|
1 |
<RESOURCE_PROFILE> |
|
2 |
<HEADER> |
|
3 |
<RESOURCE_IDENTIFIER value="1d52dba7-1902-4c25-bf5b-3598f29ef11c_RGVkdXBDb25maWd1cmF0aW9uRFNSZXNvdXJjZXMvRGVkdXBDb25maWd1cmF0aW9uRFNSZXNvdXJjZVR5cGU="/> |
|
4 |
<RESOURCE_TYPE value="DedupConfigurationDSResourceType"/> |
|
5 |
<RESOURCE_KIND value="DedupConfigurationDSResources"/> |
|
6 |
<RESOURCE_URI value=""/> |
|
7 |
<DATE_OF_CREATION value="2001-12-31T12:00:00"/> |
|
8 |
</HEADER> |
|
9 |
<BODY> |
|
10 |
<CONFIGURATION> |
|
11 |
<DESCRIPTION>2 - Organization: Match against the legalname</DESCRIPTION> |
|
12 |
<DEDUPLICATION> |
|
13 |
{ |
|
14 |
"wf" : { |
|
15 |
"threshold" : "0.95", |
|
16 |
"dedupRun" : "001", |
|
17 |
"entityType" : "organization", |
|
18 |
"orderField" : "legalname", |
|
19 |
"queueMaxSize" : "2000", |
|
20 |
"groupMaxSize" : "10", |
|
21 |
"slidingWindowSize" : "200", |
|
22 |
"rootBuilder" : [ "organization", "projectOrganization_participation_isParticipant", "datasourceOrganization_provision_isProvidedBy" ], |
|
23 |
"includeChildren" : "true" |
|
24 |
}, |
|
25 |
"pace" : { |
|
26 |
"clustering" : [ |
|
27 |
{ "name" : "ngrampairs", "fields" : [ "legalname" ], "params" : { "max" : "1", "ngramLen" : "3"} }, |
|
28 |
{ "name" : "suffixprefix", "fields" : [ "legalname" ], "params" : { "max" : "1", "len" : "3" } } |
|
29 |
], |
|
30 |
"model" : [ |
|
31 |
{ "name" : "legalname", "algo" : "JaroWinkler", "type" : "String", "weight" : "1.0", "ignoreMissing" : "false", "path" : "organization/metadata/legalname/value" } |
|
32 |
], |
|
33 |
"blacklists" : { } |
|
34 |
} |
|
35 |
} |
|
36 |
</DEDUPLICATION> |
|
37 |
</CONFIGURATION> |
|
38 |
<STATUS> |
|
39 |
<LAST_UPDATE value="2001-12-31T12:00:00"/> |
|
40 |
</STATUS> |
|
41 |
<SECURITY_PARAMETERS>SECURITY_PARAMETERS</SECURITY_PARAMETERS> |
|
42 |
</BODY> |
|
43 |
</RESOURCE_PROFILE> |
modules/dnet-openaireplus-profiles/trunk/src/main/resources/eu/dnetlib/test/profiles/MDFormatDSResources/MDFormatDSResourceType/919cd504-6a4b-4de1-a33c-ad26ece21350.xml | ||
---|---|---|
1 |
<RESOURCE_PROFILE> |
|
2 |
<HEADER> |
|
3 |
<RESOURCE_IDENTIFIER value="919cd504-6a4b-4de1-a33c-ad26ece21350_TURGb3JtYXREU1Jlc291cmNlcy9NREZvcm1hdERTUmVzb3VyY2VUeXBl"/> |
|
4 |
<RESOURCE_TYPE value="MDFormatDSResourceType"/> |
|
5 |
<RESOURCE_KIND value="MDFormatDSResources"/> |
|
6 |
<RESOURCE_URI value=""/> |
|
7 |
<DATE_OF_CREATION value="2008-05-22T14:40:04+02:00"/> |
|
8 |
</HEADER> |
|
9 |
<BODY> |
|
10 |
<CONFIGURATION> |
|
11 |
<NAME>OPENAIRE</NAME> |
|
12 |
<DESCRIPTION>Openaire Metadata Format</DESCRIPTION> |
|
13 |
<INTERPRETATION>dedup</INTERPRETATION> |
|
14 |
<SCHEMA uri=""/> |
|
15 |
</CONFIGURATION> |
|
16 |
<STATUS> |
|
17 |
<LAYOUTS> |
|
18 |
<LAYOUT name="index"> |
|
19 |
<FIELDS> |
|
20 |
<!-- COMMON FIELDS --> |
|
21 |
<FIELD name="oaftype" path="entity/type" tokenizable="false" indexable="true" stat="false" result="false"/> |
|
22 |
<FIELD name="objIdentifier" path="entity/id" tokenizable="false" indexable="true" stat="false" result="false"/> |
|
23 |
<FIELD name="collectedfrom" path="entity/collectedfrom/value" tokenizable="false" indexable="true" result="false" stat="false" /> |
|
24 |
<FIELD name="pid" path="entity/pid/value" tokenizable="false" stat="false" result="false" indexable="true"/> |
|
25 |
<FIELD name="deletedbyinference" path="dataInfo/deletedbyinference" tokenizable="false" stat="false" result="false" indexable="true"/> |
|
26 |
<FIELD name="inferred" path="dataInfo/inferred" tokenizable="false" stat="false" result="false" indexable="true"/> |
|
27 |
<FIELD name="actionset" indexable="true" stat="false" tokenizable="false" result="false"/> |
|
28 |
|
|
29 |
<!-- ORGANIZATION FIELDS --> |
|
30 |
<FIELD name="organizationlegalname" path="entity/organization/metadata/legalname/value | entity/children/organization/metadata/legalname/value" type="ngramtext" stat="false" indexable="true" result="false"/> |
|
31 |
<FIELD name="organizationlegalshortname" path="entity/organization/metadata/legalshortname/value | entity/children/organization/metadata/legalshortname/value" type="ngramtext" stat="false" indexable="true" result="false"/> |
|
32 |
<FIELD name="organizationwebsiteurl" path="entity/organization/metadata/websiteurl/value" stat="false" indexable="true" result="false"/> |
|
33 |
|
|
34 |
<!-- PERSON FIELDS --> |
|
35 |
<FIELD name="personfirstname" path="entity/person/metadata/firstname/value" stat="false" indexable="true" result="false"/> |
|
36 |
<FIELD name="personsecondnames" path="entity/person/metadata/secondnames/value" stat="false" indexable="true" result="false"/> |
|
37 |
<FIELD name="personfullname" path="entity/person/metadata/fullname/value" stat="false" indexable="true" result="false"/> |
|
38 |
|
|
39 |
<!-- RESULT FIELDS --> |
|
40 |
<FIELD name="resulttitle" path="entity/result/metadata/title/value | entity/children/result/metadata/title/value" stat="false" result="false" indexable="true"/> |
|
41 |
<FIELD name="resultdescription" path="entity/result/metadata/description/value | entity/children/result/metadata/description/value" result="false" indexable="true" stat="false"/> |
|
42 |
|
|
43 |
</FIELDS> |
|
44 |
</LAYOUT> |
|
45 |
</LAYOUTS> |
|
46 |
</STATUS> |
|
47 |
</BODY> |
|
48 |
</RESOURCE_PROFILE> |
Also available in: Unified diff
merged branch dedupConf