Project

General

Profile

« Previous | Next » 

Revision 52937

added support for sub entities

View differences:

modules/dnet-openaireplus-profiles/trunk/src/main/resources/eu/dnetlib/test/profiles/DedupConfigurationDSResources/DedupConfigurationDSResourceType/result.step.01.xml
1
<RESOURCE_PROFILE>
2
    <HEADER>
3
        <RESOURCE_IDENTIFIER value="c611ec67-eefc-4ffe-a5d4-cb3fc40a8bag_RGVkdXBDb25maWd1cmF0aW9uRFNSZXNvdXJjZXMvRGVkdXBDb25maWd1cmF0aW9uRFNSZXNvdXJjZVR5cGU="/>
4
        <RESOURCE_TYPE value="DedupConfigurationDSResourceType"/>
5
        <RESOURCE_KIND value="DedupConfigurationDSResources"/>
6
        <RESOURCE_URI value=""/>
7
        <DATE_OF_CREATION value="2001-12-31T12:00:00"/>
8
    </HEADER>
9
    <BODY>
10
        <CONFIGURATION>
11
	        <DESCRIPTION>1 - Publication: Match against the title, whose numbers must match</DESCRIPTION>
12
            <DEDUPLICATION>
13
    {
14
        "wf" : {
15
            "threshold" : "0.99",
16
		    "dedupRun" : "001",
17
		    "entityType" : "result",
18
		    "orderField" : "title",
19
		    "queueMaxSize" : "4000",
20
		    "groupMaxSize" : "40",
21
		    "slidingWindowSize" : "200",
22
		    "rootBuilder" : [ "result", "personResult_authorship_hasAuthor", "resultProject_outcome_isProducedBy", "resultResult_publicationDataset_isRelatedTo", "resultResult_similarity_isAmongTopNSimilarDocuments", "resultResult_similarity_hasAmongTopNSimilarDocuments" ],
23
		    "includeChildren" : "true",
24
		    "maxChildren" : "40"
25
        },
26
        "pace" : {
27
        "clustering" : [
28
            { "name" : "ngrampairs", "fields" : [ "title" ], "params" : { "max" : "1", "ngramLen" : "3"} },
29
            { "name" : "suffixprefix", "fields" : [ "title" ], "params" : { "max" : "1", "len" : "3" } },
30
            { "name" : "lowercase", "fields" : [ "doi" ], "params" : { } }
31
        ],
32
        "strictConditions" : [
33
            { "name" : "pidMatch", "fields" : [ "pid" ] }
34
        ],
35
        "conditions" : [
36
		    { "name" : "titleVersionMatch", "fields" : [ "title" ] },
37
		    { "name" : "sizeMatch", "fields" : [ "authors" ] }
38
	    ],
39
	    "model" : [
40
            { "name" : "doi", "algo" : "Null", "type" : "String", "weight" : "0.0", "ignoreMissing" : "true", "path" : "pid[qualifier#classid = {doi}]/value" },
41
            { "name" : "pid", "algo" : "Null", "type" : "JSON", "weight" : "0.0", "ignoreMissing" : "true", "path" : "pid", "overrideMatch" : "true" },
42
		    { "name" : "title", "algo" : "LevensteinTitle", "type" : "String", "weight" : "1.0", "ignoreMissing" : "false", "path" : "result/metadata/title[qualifier#classid = {main title}]/value" },
43
		    { "name" : "authors", "algo" : "Null", "type" : "String", "weight" : "0.0", "ignoreMissing" : "true", "path" : "result/author/metadata/fullname/value" }
44
	    ],
45
	    "blacklists" : {
46
		    "title" : [
47
	            "^Inside Front Cover$",
48
	            "(?i)^Poster presentations$",
49
			    "^THE ASSOCIATION AND THE GENERAL MEDICAL COUNCIL$",
50
			    "^Problems with perinatal pathology\\.?$",
51
			    "(?i)^Cases? of Puerperal Convulsions$",
52
			    "(?i)^Operative Gyna?ecology$",
53
			    "(?i)^Mind the gap\\!?\\:?$",
54
			    "^Chronic fatigue syndrome\\.?$",
55
			    "^Cartas? ao editor Letters? to the Editor$",
56
			    "^Note from the Editor$",
57
			    "^Anesthesia Abstract$",
58
			    "^Annual report$",
59
			    "(?i)^“?THE RADICAL PREVENTION OF VENEREAL DISEASE\\.?”?$",
60
			    "(?i)^Graph and Table of Infectious Diseases?$",
61
			    "^Presentation$",
62
			    "(?i)^Reviews and Information on Publications$",
63
			    "(?i)^PUBLIC HEALTH SERVICES?$",
64
			    "(?i)^COMBINED TEXT-?BOOK OF OBSTETRICS AND GYN(Æ|ae)COLOGY$",
65
			    "(?i)^Adrese autora$",
66
			    "(?i)^Systematic Part .*\\. Catalogus Fossilium Austriae, Band 2: Echinoidea neogenica$",
67
			    "(?i)^Acknowledgement to Referees$",
68
			    "(?i)^Behçet's disease\\.?$",
69
			    "(?i)^Isolation and identification of restriction endonuclease.*$",
70
			    "(?i)^CEREBROVASCULAR DISEASES?.?$",
71
			    "(?i)^Screening for abdominal aortic aneurysms?\\.?$",
72
			    "^Event management$",
73
			    "(?i)^Breakfast and Crohn's disease.*\\.?$",
74
			    "^Cálculo de concentraciones en disoluciones acuosas. Ejercicio interactivo\\..*\\.$",
75
			    "(?i)^Genetic and functional analyses of SHANK2 mutations suggest a multiple hit model of Autism spectrum disorders?\\.?$",
76
			    "^Gushi hakubutsugaku$",
77
			    "^Starobosanski nadpisi u Bosni i Hercegovini \\(.*\\)$",
78
			    "^Intestinal spirocha?etosis$",
79
			    "^Treatment of Rodent Ulcer$",
80
			    "(?i)^\\W*Cloud Computing\\W*$",
81
			    "^Compendio mathematico : en que se contienen todas las materias mas principales de las Ciencias que tratan de la cantidad$",
82
			    "^Free Communications, Poster Presentations: Session [A-F]$",
83
			    "^“The Historical Aspects? of Quackery\\.?”$",
84
			    "^A designated centre for people with disabilities operated by St John of God Community Services (Limited|Ltd), Louth$",
85
			    "^P(er|re)-Mile Premiums for Auto Insurance\\.?$",
86
			    "(?i)^Case Report$",
87
			    "^Boletín Informativo$",
88
			    "(?i)^Glioblastoma Multiforme$",
89
			    "(?i)^Nuevos táxones animales descritos en la península Ibérica y Macaronesia desde 1994 \\(.*\\)$",
90
			    "^Zaměstnanecké výhody$",
91
			    "(?i)^The Economics of Terrorism and Counter-Terrorism: A Survey \\(Part .*\\)$",
92
			    "(?i)^Carotid body tumours?\\.?$",
93
			    "(?i)^\\[Españoles en Francia : La condición Emigrante.*\\]$",
94
			    "^Avant-propos$",
95
			    "(?i)^St\\. Patrick's Cathedral, Dublin, County Dublin - Head(s)? and Capital(s)?$",
96
			    "(?i)^St\\. Patrick's Cathedral, Dublin, County Dublin - Bases?$",
97
			    "(?i)^PUBLIC HEALTH VERSUS THE STATE$",
98
			    "^Viñetas de Cortázar$",
99
			    "(?i)^Search for heavy neutrinos and W(\\[|_|\\(|_\\{|-)?R(\\]|\\)|\\})? bosons with right-handed couplings in a left-right symmetric model in pp collisions at.*TeV(\\.)?$",
100
			    "(?i)^Measurement of the pseudorapidity and centrality dependence of the transverse energy density in Pb(-?)Pb collisions at.*tev(\\.?)$",
101
			    "(?i)^Search for resonances decaying into top-quark pairs using fully hadronic decays in pp collisions with ATLAS at.*TeV$",
102
			    "(?i)^Search for neutral minimal supersymmetric standard model Higgs bosons decaying to tau pairs in pp collisions at.*tev$",
103
			    "(?i)^Relatório de Estágio (de|em) Angiologia e Cirurgia Vascular$",
104
			    "^Aus der AGMB$",
105
			    "^Znanstveno-stručni prilozi$",
106
			    "(?i)^Zhodnocení finanční situace podniku a návrhy na zlepšení$",
107
			    "(?i)^Evaluation of the Financial Situation in the Firm and Proposals to its Improvement$",
108
			    "(?i)^Hodnocení finanční situace podniku a návrhy na její zlepšení$",
109
			    "^Finanční analýza podniku$",
110
			    "^Financial analysis( of business)?$",
111
			    "(?i)^Textbook of Gyn(a)?(Æ)?(e)?cology$",
112
			    "^Jikken nihon shūshinsho$",
113
			    "(?i)^CORONER('|s)(s|') INQUESTS$",
114
			    "(?i)^(Μελέτη παραγόντων )?risk management( για ανάπτυξη και εφαρμογή ενός πληροφοριακού συστήματος| και ανάπτυξη συστήματος)?$",
115
			    "(?i)^Consultants' contract(s)?$",
116
			    "(?i)^Upute autorima$",
117
			    "(?i)^Bijdrage tot de Kennis van den Godsdienst der Dajaks van Lan(d|f)ak en Tajan$",
118
			    "^Joshi shin kokubun$",
119
			    "^Kōtō shōgaku dokuhon nōson'yō$",
120
			    "^Jinjō shōgaku shōka$",
121
			    "^Shōgaku shūjichō$",
122
			    "^Nihon joshi dokuhon$",
123
			    "^Joshi shin dokuhon$",
124
			    "^Chūtō kanbun dokuhon$",
125
			    "^Wabun dokuhon$",
126
			    "(?i)^(Analysis of economy selected village or town|Rozbor hospodaření vybrané obce či města)$",
127
			    "(?i)^cardiac rehabilitation$",
128
			    "(?i)^Analytical summary$",
129
			    "^Thesaurus resolutionum Sacrae Congregationis Concilii$",
130
			    "(?i)^Sumario analítico(\\s{1})?(Analitic summary)?$",
131
			    "^Prikazi i osvrti$",
132
			    "^Rodinný dům s provozovnou$",
133
			    "^Family house with an establishment$",
134
			    "^Shinsei chūtō shin kokugun$",
135
			    "^Pulmonary alveolar proteinosis(\\.?)$",
136
			    "^Shinshū kanbun$",
137
			    "^Viñeta(s?) de Rodríguez$",
138
			    "(?i)^RUBRIKA UREDNIKA$",
139
			    "^A Matching Model of the Academic Publication Market$",
140
			    "^Yōgaku kōyō$",
141
			    "^Internetový marketing$",
142
			    "^Internet marketing$",
143
			    "^Chūtō kokugo dokuhon$",
144
			    "^Kokugo dokuhon$",
145
			    "^Antibiotic Cover for Dental Extraction(s?)$",
146
			    "^Strategie podniku$",
147
			    "^Strategy of an Enterprise$",
148
			    "(?i)^respiratory disease(s?)(\\.?)$",
149
			    "^Award(s?) for Gallantry in Civil Defence$",
150
			    "^Podniková kultura$",
151
			    "^Corporate Culture$",
152
			    "^Severe hyponatraemia in hospital inpatient(s?)(\\.?)$",
153
			    "^Pracovní motivace$",
154
			    "^Work Motivation$",
155
			    "^Kaitei kōtō jogaku dokuhon$",
156
			    "^Konsolidovaná účetní závěrka$",
157
			    "^Consolidated Financial Statements$",
158
			    "(?i)^intracranial tumour(s?)$",
159
			    "^Climate Change Mitigation Options and Directed Technical Change: A Decentralized Equilibrium Analysis$",
160
			    "^\\[CERVECERIAS MAHOU(\\.|\\:) INTERIOR\\] \\[Material gráfico\\]$",
161
			    "^Housing Market Dynamics(\\:|\\.) On the Contribution of Income Shocks and Credit Constraint(s?)$",
162
			    "^\\[Funciones auxiliares de la música en Radio París,.*\\]$",
163
			    "^Úroveň motivačního procesu jako způsobu vedení lidí$",
164
			    "^The level of motivation process as a leadership$",
165
			    "^Pay-beds in N(\\.?)H(\\.?)S(\\.?) Hospitals$",
166
			    "(?i)^news and events$",
167
			    "(?i)^NOVOSTI I DOGAĐAJI$",
168
			    "^Sansū no gakushū$",
169
			    "^Posouzení informačního systému firmy a návrh změn$",
170
			    "^Information System Assessment and Proposal for ICT Modification$",
171
			    "^Stresové zatížení pracovníků ve vybrané profesi$",
172
			    "^Stress load in a specific job$",
173
			    "^Sunday: Poster Sessions, Pt.*$",
174
			    "^Monday: Poster Sessions, Pt.*$",
175
			    "^Wednesday: Poster Sessions, Pt.*",
176
			    "^Tuesday: Poster Sessions, Pt.*$",
177
			    "^Analýza reklamy$",
178
			    "^Analysis of advertising$",
179
			    "^Shōgaku shūshinsho$",
180
			    "^Shōgaku sansū$",
181
			    "^Shintei joshi kokubun$",
182
			    "^Taishō joshi kokubun dokuhon$",
183
			    "^Joshi kokubun$",
184
			    "^Účetní uzávěrka a účetní závěrka v ČR$",
185
			    "(?i)^The \"?Causes\"? of Cancer$",
186
			    "^Normas para la publicación de artículos$",
187
			    "^Editor('|s)(s|') [Rr]eply$",
188
			    "^Editor(’|s)(s|’) letter$",
189
			    "^Redaktoriaus žodis$",
190
			    "^DISCUSSION ON THE PRECEDING PAPER$",
191
			    "^Kōtō shōgaku shūshinsho jidōyō$",
192
			    "^Shōgaku nihon rekishi$",
193
			    "^(Theory of the flow of action currents in isolated myelinated nerve fibers).*$",
194
			    "^Préface$",
195
			    "^Occupational [Hh]ealth [Ss]ervices.$",
196
			    "^In Memoriam Professor Toshiyuki TAKESHIMA$",
197
			    "^Účetní závěrka ve vybraném podniku.*$",
198
			    "^Financial statements in selected company$",
199
			    "^Abdominal [Aa]ortic [Aa]neurysms.*$",
200
			    "^Pseudomyxoma peritonei$",
201
			    "^Kazalo autora$",
202
			    "(?i)^uvodna riječ$",
203
			    "^Motivace jako způsob vedení lidí$",
204
			    "^Motivation as a leadership$",
205
			    "^Polyfunkční dům$",
206
			    "^Multi\\-funkcional building$",
207
			    "^Podnikatelský plán$",
208
			    "(?i)^Podnikatelský záměr$",
209
			    "(?i)^Business Plan$",
210
			    "^Oceňování nemovitostí$",
211
			    "^Marketingová komunikace$",
212
			    "^Marketing communication$",
213
			    "^Sumario Analítico$",
214
			    "^Riječ uredništva$",
215
			    "^Savjetovanja i priredbe$",
216
			    "^Índice$",
217
			    "^(Starobosanski nadpisi).*$",
218
			    "^Vzdělávání pracovníků v organizaci$",
219
			    "^Staff training in organization$",
220
			    "^(Life Histories of North American Geometridae).*$",
221
			    "^Strategická analýza podniku$",
222
			    "^Strategic Analysis of an Enterprise$",
223
			    "^Sadržaj$",
224
			    "^Upute suradnicima$",
225
			    "^Rodinný dům$",
226
			    "(?i)^Fami(l)?ly house$",
227
			    "^Upute autorima$",
228
			    "^Strategic Analysis$",
229
			    "^Finanční analýza vybraného podniku$",
230
			    "^Finanční analýza$",
231
			    "^Riječ urednika$",
232
			    "(?i)^Content(s?)$",
233
			    "(?i)^Inhalt$",
234
			    "^Jinjō shōgaku shūshinsho jidōyō$",
235
			    "(?i)^Index$",
236
			    "^Chūgaku kokubun kyōkasho$",
237
			    "^Retrato de una mujer$",
238
			    "^Retrato de un hombre$",
239
			    "^Kōtō shōgaku dokuhon$",
240
			    "^Shotōka kokugo$",
241
			    "^Shōgaku dokuhon$",
242
			    "^Jinjō shōgaku kokugo dokuhon$",
243
			    "^Shinsei kokugo dokuhon$",
244
			    "^Teikoku dokuhon$",
245
			    "^Instructions to Authors$",
246
			    "^KİTAP TAHLİLİ$",
247
			    "^PRZEGLĄD PIŚMIENNICTWA$",
248
			    "(?i)^Presentación$",
249
			    "^İçindekiler$",
250
			    "(?i)^Tabl?e of contents$",
251
			    "^(CODICE DEL BEATO DE LOS REYES FERNANDO I Y SANCHA).*$",
252
			    "^(\\[MADRID\\. BIBL\\. NAC\\. N.*KING FERDINAND I.*FROM SAN ISIDORO DE LEON\\. FACUNDUS SCRIPSIT DATED.*\\]).*",
253
			    "^Editorial( Board)?$",
254
			    "(?i)^Editorial \\(English\\)$",
255
			    "^Editörden$",
256
			    "^(Corpus Oral Dialectal \\(COD\\)\\.).*$",
257
			    "^(Kiri Karl Morgensternile).*$",
258
			    "^(\\[Eksliibris Aleksandr).*\\]$",
259
			    "^(\\[Eksliibris Aleksandr).*$",
260
			    "^(Eksliibris Aleksandr).*$",
261
			    "^(Kiri A\\. de Vignolles).*$",
262
			    "^(2 kirja Karl Morgensternile).*$",
263
			    "^(Pirita kloostri idaosa arheoloogilised).*$",
264
			    "^(Kiri tundmatule).*$",
265
			    "^(Kiri Jenaer Allgemeine Literaturzeitung toimetusele).*$",
266
			    "^(Eksliibris Nikolai Birukovile).*$",
267
			    "^(Eksliibris Nikolai Issakovile).*$",
268
			    "^(WHP Cruise Summary Information of section).*$",
269
			    "^(Measurement of the top quark\\-pair production cross section with ATLAS in pp collisions at).*$",
270
			    "^(Measurement of the spin\\-dependent structure function).*",
271
			    "(?i)^.*authors['’′]? reply\\.?$",
272
			    "(?i)^.*authors['’′]? response\\.?$"
273
            ]
274
	    }
275
        }
276
    }
277
            </DEDUPLICATION>
278
        </CONFIGURATION>
279
        <STATUS>
280
            <LAST_UPDATE value="2001-12-31T12:00:00"/>
281
        </STATUS>
282
        <SECURITY_PARAMETERS>SECURITY_PARAMETERS</SECURITY_PARAMETERS>
283
    </BODY>
284
</RESOURCE_PROFILE>
modules/dnet-openaireplus-profiles/trunk/src/main/resources/eu/dnetlib/test/profiles/DedupConfigurationDSResources/DedupConfigurationDSResourceType/result.dataset.xml
1
<RESOURCE_PROFILE>
2
	<HEADER>
3
		<RESOURCE_IDENTIFIER value="c611ec67-eefc-4ffe-a5d4-cb3fc40a8baa_RGVkdXBDb25maWd1cmF0aW9uRFNSZXNvdXJjZXMvRGVkdXBDb25maWd1cmF0aW9uRFNSZXNvdXJjZVR5cGU="/>
4
		<RESOURCE_TYPE value="DedupConfigurationDSResourceType"/>
5
		<RESOURCE_KIND value="DedupConfigurationDSResources"/>
6
		<RESOURCE_URI value=""/>
7
		<DATE_OF_CREATION value="2018-07-25T15:04:07+00:00"/>
8
	</HEADER>
9
	<BODY>
10
		<CONFIGURATION>
11
			<DESCRIPTION>2 - Dataset: group by DOI and match against the title, when PIDs are not available</DESCRIPTION>
12
			<DEDUPLICATION>
13
{
14
	"wf" : {
15
		"threshold" : "0.99",
16
		"dedupRun" : "001",
17
		"entityType" : "result",
18
		"subEntityType" : "resulttype",
19
		"subEntityValue" : "dataset",
20
		"orderField" : "title",
21
		"queueMaxSize" : "2000",
22
		"groupMaxSize" : "10",
23
		"slidingWindowSize" : "200",
24
		"rootBuilder" : [ "result", "resultProject_outcome_isProducedBy", "resultResult_publicationDataset_isRelatedTo", "resultResult_similarity_isAmongTopNSimilarDocuments", "resultResult_similarity_hasAmongTopNSimilarDocuments", "resultOrganization_affiliation_isAffiliatedWith" ],
25
		"includeChildren" : "true"
26
	},
27
	"pace" : {
28
		"clustering" : [
29
			{ "name" : "ngrampairs", "fields" : [ "title" ], "params" : { "max" : "1", "ngramLen" : "3"} },
30
			{ "name" : "suffixprefix", "fields" : [ "title" ], "params" : { "max" : "1", "len" : "3" } },
31
			{ "name" : "lowercase", "fields" : [ "doi" ], "params" : { } }
32
		],
33
		"strictConditions" : [
34
			{ "name" : "pidMatch", "fields" : [ "pid" ] },
35
			{ "name" : "exactMatch", "fields" : [ "resulttype" ] }
36
		],
37
		"conditions" : [
38
			{ "name" : "titleVersionMatch", "fields" : [ "title" ] },
39
			{ "name" : "sizeMatch", "fields" : [ "authors" ] }
40
		],
41
		"model" : [
42
			{ "name" : "doi", "algo" : "Null", "type" : "String", "weight" : "0.0", "ignoreMissing" : "true", "path" : "pid[qualifier#classid = {doi}]/value" },
43
			{ "name" : "pid", "algo" : "Null", "type" : "JSON", "weight" : "0.0", "ignoreMissing" : "true", "path" : "pid", "overrideMatch" : "true" },
44
			{ "name" : "title", "algo" : "LevensteinTitle", "type" : "String", "weight" : "1.0", "ignoreMissing" : "false", "path" : "result/metadata/title[qualifier#classid = {main title}]/value" },
45
			{ "name" : "authors", "algo" : "Null", "type" : "String", "weight" : "0.0", "ignoreMissing" : "true", "path" : "result/metadata/author/fullname" },
46
			{ "name" : "resulttype", "algo" : "Null", "type" : "String", "weight" : "0.0", "ignoreMissing" : "false", "path" : "result/metadata/resulttype/classid" }
47
		],
48
		"blacklists" : {
49

  
50
		}
51
	}
52
}
53
			</DEDUPLICATION>
54
		</CONFIGURATION>
55
		<STATUS>
56
			<LAST_UPDATE value="2001-12-31T12:00:00"/>
57
		</STATUS>
58
		<SECURITY_PARAMETERS>SECURITY_PARAMETERS</SECURITY_PARAMETERS>
59
	</BODY>
60
</RESOURCE_PROFILE>
modules/dnet-openaireplus-profiles/trunk/src/main/resources/eu/dnetlib/test/profiles/DedupConfigurationDSResources/DedupConfigurationDSResourceType/result.publication.xml
1
<RESOURCE_PROFILE>
2
	<HEADER>
3
		<RESOURCE_IDENTIFIER value="c611ec67-eefc-4ffe-a5d4-cb3fc40a8bag_RGVkdXBDb25maWd1cmF0aW9uRFNSZXNvdXJjZXMvRGVkdXBDb25maWd1cmF0aW9uRFNSZXNvdXJjZVR5cGU="/>
4
		<RESOURCE_TYPE value="DedupConfigurationDSResourceType"/>
5
		<RESOURCE_KIND value="DedupConfigurationDSResources"/>
6
		<RESOURCE_URI value=""/>
7
		<DATE_OF_CREATION value="2018-07-25T15:04:07+00:00"/>
8
	</HEADER>
9
	<BODY>
10
		<CONFIGURATION>
11
			<DESCRIPTION>1 - Publication: Match against the title, whose numbers must match</DESCRIPTION>
12
			<DEDUPLICATION>
13
{
14
	"wf" : {
15
		"threshold" : "0.99",
16
		"dedupRun" : "001",
17
		"entityType" : "result",
18
		"subEntityType" : "resulttype",
19
		"subEntityValue" : "publication",
20
		"orderField" : "title",
21
		"queueMaxSize" : "2000",
22
		"groupMaxSize" : "10",
23
		"slidingWindowSize" : "200",
24
		"rootBuilder" : [ "result", "resultProject_outcome_isProducedBy", "resultResult_publicationDataset_isRelatedTo", "resultResult_similarity_isAmongTopNSimilarDocuments", "resultResult_similarity_hasAmongTopNSimilarDocuments", "resultOrganization_affiliation_isAffiliatedWith" ],
25
		"includeChildren" : "true"
26
	},
27
	"pace" : {
28
		"clustering" : [
29
			{ "name" : "ngrampairs", "fields" : [ "title" ], "params" : { "max" : "1", "ngramLen" : "3"} },
30
			{ "name" : "suffixprefix", "fields" : [ "title" ], "params" : { "max" : "1", "len" : "3" } },
31
			{ "name" : "lowercase", "fields" : [ "doi" ], "params" : { } }
32
		],
33
		"strictConditions" : [
34
			{ "name" : "pidMatch", "fields" : [ "pid" ] },
35
			{ "name" : "exactMatch", "fields" : [ "resulttype" ] }
36
		],
37
		"conditions" : [
38
			{ "name" : "titleVersionMatch", "fields" : [ "title" ] },
39
			{ "name" : "sizeMatch", "fields" : [ "authors" ] }
40
		],
41
		"model" : [
42
			{ "name" : "doi", "algo" : "Null", "type" : "String", "weight" : "0.0", "ignoreMissing" : "true", "path" : "pid[qualifier#classid = {doi}]/value" },
43
			{ "name" : "pid", "algo" : "Null", "type" : "JSON", "weight" : "0.0", "ignoreMissing" : "true", "path" : "pid", "overrideMatch" : "true" },
44
			{ "name" : "title", "algo" : "LevensteinTitle", "type" : "String", "weight" : "1.0", "ignoreMissing" : "false", "path" : "result/metadata/title[qualifier#classid = {main title}]/value" },
45
			{ "name" : "authors", "algo" : "Null", "type" : "String", "weight" : "0.0", "ignoreMissing" : "true", "path" : "result/metadata/author/fullname" },
46
			{ "name" : "resulttype", "algo" : "Null", "type" : "String", "weight" : "0.0", "ignoreMissing" : "false", "path" : "result/metadata/resulttype/classid" }
47
		],
48
		"blacklists" : {
49
			"title" : [
50
				"^Inside Front Cover$",
51
				"(?i)^Poster presentations$",
52
				"^THE ASSOCIATION AND THE GENERAL MEDICAL COUNCIL$",
53
				"^Problems with perinatal pathology\\.?$",
54
				"(?i)^Cases? of Puerperal Convulsions$",
55
				"(?i)^Operative Gyna?ecology$",
56
				"(?i)^Mind the gap\\!?\\:?$",
57
				"^Chronic fatigue syndrome\\.?$",
58
				"^Cartas? ao editor Letters? to the Editor$",
59
				"^Note from the Editor$",
60
				"^Anesthesia Abstract$",
61

  
62
				"^Annual report$",
63
				"(?i)^“?THE RADICAL PREVENTION OF VENEREAL DISEASE\\.?”?$",
64
				"(?i)^Graph and Table of Infectious Diseases?$",
65
				"^Presentation$",
66
				"(?i)^Reviews and Information on Publications$",
67
				"(?i)^PUBLIC HEALTH SERVICES?$",
68
				"(?i)^COMBINED TEXT-?BOOK OF OBSTETRICS AND GYN(Æ|ae)COLOGY$",
69
				"(?i)^Adrese autora$",
70
				"(?i)^Systematic Part .*\\. Catalogus Fossilium Austriae, Band 2: Echinoidea neogenica$",
71
				"(?i)^Acknowledgement to Referees$",
72
				"(?i)^Behçet's disease\\.?$",
73
				"(?i)^Isolation and identification of restriction endonuclease.*$",
74
				"(?i)^CEREBROVASCULAR DISEASES?.?$",
75
				"(?i)^Screening for abdominal aortic aneurysms?\\.?$",
76
				"^Event management$",
77
				"(?i)^Breakfast and Crohn's disease.*\\.?$",
78
				"^Cálculo de concentraciones en disoluciones acuosas. Ejercicio interactivo\\..*\\.$",
79
				"(?i)^Genetic and functional analyses of SHANK2 mutations suggest a multiple hit model of Autism spectrum disorders?\\.?$",
80
				"^Gushi hakubutsugaku$",
81

  
82
				"^Starobosanski nadpisi u Bosni i Hercegovini \\(.*\\)$",
83
				"^Intestinal spirocha?etosis$",
84
				"^Treatment of Rodent Ulcer$",
85
				"(?i)^\\W*Cloud Computing\\W*$",
86
				"^Compendio mathematico : en que se contienen todas las materias mas principales de las Ciencias que tratan de la cantidad$",
87
				"^Free Communications, Poster Presentations: Session [A-F]$",
88

  
89
				"^“The Historical Aspects? of Quackery\\.?”$",
90
				"^A designated centre for people with disabilities operated by St John of God Community Services (Limited|Ltd), Louth$",
91
				"^P(er|re)-Mile Premiums for Auto Insurance\\.?$",
92
				"(?i)^Case Report$",
93
				"^Boletín Informativo$",
94
				"(?i)^Glioblastoma Multiforme$",
95
				"(?i)^Nuevos táxones animales descritos en la península Ibérica y Macaronesia desde 1994 \\(.*\\)$",
96
				"^Zaměstnanecké výhody$",
97
				"(?i)^The Economics of Terrorism and Counter-Terrorism: A Survey \\(Part .*\\)$",
98
				"(?i)^Carotid body tumours?\\.?$",
99
				"(?i)^\\[Españoles en Francia : La condición Emigrante.*\\]$",
100
				"^Avant-propos$",
101
				"(?i)^St\\. Patrick's Cathedral, Dublin, County Dublin - Head(s)? and Capital(s)?$",
102
				"(?i)^St\\. Patrick's Cathedral, Dublin, County Dublin - Bases?$",
103
				"(?i)^PUBLIC HEALTH VERSUS THE STATE$",
104
				"^Viñetas de Cortázar$",
105
				"(?i)^Search for heavy neutrinos and W(\\[|_|\\(|_\\{|-)?R(\\]|\\)|\\})? bosons with right-handed couplings in a left-right symmetric model in pp collisions at.*TeV(\\.)?$",
106
				"(?i)^Measurement of the pseudorapidity and centrality dependence of the transverse energy density in Pb(-?)Pb collisions at.*tev(\\.?)$",
107
				"(?i)^Search for resonances decaying into top-quark pairs using fully hadronic decays in pp collisions with ATLAS at.*TeV$",
108
				"(?i)^Search for neutral minimal supersymmetric standard model Higgs bosons decaying to tau pairs in pp collisions at.*tev$",
109

  
110
				"(?i)^Relatório de Estágio (de|em) Angiologia e Cirurgia Vascular$",
111
				"^Aus der AGMB$",
112

  
113
				"^Znanstveno-stručni prilozi$",
114
				"(?i)^Zhodnocení finanční situace podniku a návrhy na zlepšení$",
115
				"(?i)^Evaluation of the Financial Situation in the Firm and Proposals to its Improvement$",
116
				"(?i)^Hodnocení finanční situace podniku a návrhy na její zlepšení$",
117
				"^Finanční analýza podniku$",
118
				"^Financial analysis( of business)?$",
119
				"(?i)^Textbook of Gyn(a)?(Æ)?(e)?cology$",
120
				"^Jikken nihon shūshinsho$",
121
				"(?i)^CORONER('|s)(s|') INQUESTS$",
122
				"(?i)^(Μελέτη παραγόντων )?risk management( για ανάπτυξη και εφαρμογή ενός πληροφοριακού συστήματος| και ανάπτυξη συστήματος)?$",
123
				"(?i)^Consultants' contract(s)?$",
124
				"(?i)^Upute autorima$",
125
				"(?i)^Bijdrage tot de Kennis van den Godsdienst der Dajaks van Lan(d|f)ak en Tajan$",
126
				"^Joshi shin kokubun$",
127
				"^Kōtō shōgaku dokuhon nōson'yō$",
128
				"^Jinjō shōgaku shōka$",
129
				"^Shōgaku shūjichō$",
130
				"^Nihon joshi dokuhon$",
131
				"^Joshi shin dokuhon$",
132
				"^Chūtō kanbun dokuhon$",
133
				"^Wabun dokuhon$",
134
				"(?i)^(Analysis of economy selected village or town|Rozbor hospodaření vybrané obce či města)$",
135
				"(?i)^cardiac rehabilitation$",
136
				"(?i)^Analytical summary$",
137
				"^Thesaurus resolutionum Sacrae Congregationis Concilii$",
138
				"(?i)^Sumario analítico(\\s{1})?(Analitic summary)?$",
139
				"^Prikazi i osvrti$",
140
				"^Rodinný dům s provozovnou$",
141
				"^Family house with an establishment$",
142
				"^Shinsei chūtō shin kokugun$",
143
				"^Pulmonary alveolar proteinosis(\\.?)$",
144
				"^Shinshū kanbun$",
145
				"^Viñeta(s?) de Rodríguez$",
146
				"(?i)^RUBRIKA UREDNIKA$",
147
				"^A Matching Model of the Academic Publication Market$",
148
				"^Yōgaku kōyō$",
149

  
150
				"^Internetový marketing$",
151
				"^Internet marketing$",
152
				"^Chūtō kokugo dokuhon$",
153
				"^Kokugo dokuhon$",
154
				"^Antibiotic Cover for Dental Extraction(s?)$",
155
				"^Strategie podniku$",
156
				"^Strategy of an Enterprise$",
157
				"(?i)^respiratory disease(s?)(\\.?)$",
158
				"^Award(s?) for Gallantry in Civil Defence$",
159
				"^Podniková kultura$",
160
				"^Corporate Culture$",
161
				"^Severe hyponatraemia in hospital inpatient(s?)(\\.?)$",
162
				"^Pracovní motivace$",
163
				"^Work Motivation$",
164
				"^Kaitei kōtō jogaku dokuhon$",
165
				"^Konsolidovaná účetní závěrka$",
166
				"^Consolidated Financial Statements$",
167
				"(?i)^intracranial tumour(s?)$",
168
				"^Climate Change Mitigation Options and Directed Technical Change: A Decentralized Equilibrium Analysis$",
169
				"^\\[CERVECERIAS MAHOU(\\.|\\:) INTERIOR\\] \\[Material gráfico\\]$",
170
				"^Housing Market Dynamics(\\:|\\.) On the Contribution of Income Shocks and Credit Constraint(s?)$",
171
				"^\\[Funciones auxiliares de la música en Radio París,.*\\]$",
172
				"^Úroveň motivačního procesu jako způsobu vedení lidí$",
173
				"^The level of motivation process as a leadership$",
174
				"^Pay-beds in N(\\.?)H(\\.?)S(\\.?) Hospitals$",
175
				"(?i)^news and events$",
176
				"(?i)^NOVOSTI I DOGAĐAJI$",
177
				"^Sansū no gakushū$",
178
				"^Posouzení informačního systému firmy a návrh změn$",
179
				"^Information System Assessment and Proposal for ICT Modification$",
180
				"^Stresové zatížení pracovníků ve vybrané profesi$",
181
				"^Stress load in a specific job$",
182

  
183
				"^Sunday: Poster Sessions, Pt.*$",
184
				"^Monday: Poster Sessions, Pt.*$",
185
				"^Wednesday: Poster Sessions, Pt.*",
186
				"^Tuesday: Poster Sessions, Pt.*$",
187

  
188
				"^Analýza reklamy$",
189
				"^Analysis of advertising$",
190

  
191
				"^Shōgaku shūshinsho$",
192
				"^Shōgaku sansū$",
193
				"^Shintei joshi kokubun$",
194
				"^Taishō joshi kokubun dokuhon$",
195
				"^Joshi kokubun$",
196

  
197
				"^Účetní uzávěrka a účetní závěrka v ČR$",
198
				"(?i)^The \"?Causes\"? of Cancer$",
199
				"^Normas para la publicación de artículos$",
200
				"^Editor('|s)(s|') [Rr]eply$",
201
				"^Editor(’|s)(s|’) letter$",
202
				"^Redaktoriaus žodis$",
203
				"^DISCUSSION ON THE PRECEDING PAPER$",
204
				"^Kōtō shōgaku shūshinsho jidōyō$",
205
				"^Shōgaku nihon rekishi$",
206
				"^(Theory of the flow of action currents in isolated myelinated nerve fibers).*$",
207
				"^Préface$",
208
				"^Occupational [Hh]ealth [Ss]ervices.$",
209
				"^In Memoriam Professor Toshiyuki TAKESHIMA$",
210
				"^Účetní závěrka ve vybraném podniku.*$",
211
				"^Financial statements in selected company$",
212
				"^Abdominal [Aa]ortic [Aa]neurysms.*$",
213
				"^Pseudomyxoma peritonei$",
214
				"^Kazalo autora$",
215

  
216
				"(?i)^uvodna riječ$",
217
				"^Motivace jako způsob vedení lidí$",
218
				"^Motivation as a leadership$",
219
				"^Polyfunkční dům$",
220
				"^Multi\\-funkcional building$",
221
				"^Podnikatelský plán$",
222
				"(?i)^Podnikatelský záměr$",
223
				"(?i)^Business Plan$",
224
				"^Oceňování nemovitostí$",
225
				"^Marketingová komunikace$",
226
				"^Marketing communication$",
227
				"^Sumario Analítico$",
228
				"^Riječ uredništva$",
229
				"^Savjetovanja i priredbe$",
230
				"^Índice$",
231
				"^(Starobosanski nadpisi).*$",
232
				"^Vzdělávání pracovníků v organizaci$",
233
				"^Staff training in organization$",
234
				"^(Life Histories of North American Geometridae).*$",
235
				"^Strategická analýza podniku$",
236
				"^Strategic Analysis of an Enterprise$",
237
				"^Sadržaj$",
238
				"^Upute suradnicima$",
239
				"^Rodinný dům$",
240
				"(?i)^Fami(l)?ly house$",
241
				"^Upute autorima$",
242
				"^Strategic Analysis$",
243
				"^Finanční analýza vybraného podniku$",
244
				"^Finanční analýza$",
245
				"^Riječ urednika$",
246
				"(?i)^Content(s?)$",
247
				"(?i)^Inhalt$",
248
				"^Jinjō shōgaku shūshinsho jidōyō$",
249
				"(?i)^Index$",
250
				"^Chūgaku kokubun kyōkasho$",
251
				"^Retrato de una mujer$",
252
				"^Retrato de un hombre$",
253
				"^Kōtō shōgaku dokuhon$",
254
				"^Shotōka kokugo$",
255
				"^Shōgaku dokuhon$",
256
				"^Jinjō shōgaku kokugo dokuhon$",
257
				"^Shinsei kokugo dokuhon$",
258
				"^Teikoku dokuhon$",
259
				"^Instructions to Authors$",
260
				"^KİTAP TAHLİLİ$",
261
				"^PRZEGLĄD PIŚMIENNICTWA$",
262
				"(?i)^Presentación$",
263
				"^İçindekiler$",
264
				"(?i)^Tabl?e of contents$",
265
				"^(CODICE DEL BEATO DE LOS REYES FERNANDO I Y SANCHA).*$",
266
				"^(\\[MADRID\\. BIBL\\. NAC\\. N.*KING FERDINAND I.*FROM SAN ISIDORO DE LEON\\. FACUNDUS SCRIPSIT DATED.*\\]).*",
267
				"^Editorial( Board)?$",
268
				"(?i)^Editorial \\(English\\)$",
269
				"^Editörden$",
270
				"^(Corpus Oral Dialectal \\(COD\\)\\.).*$",
271
				"^(Kiri Karl Morgensternile).*$",
272
				"^(\\[Eksliibris Aleksandr).*\\]$",
273
				"^(\\[Eksliibris Aleksandr).*$",
274
				"^(Eksliibris Aleksandr).*$",
275
				"^(Kiri A\\. de Vignolles).*$",
276
				"^(2 kirja Karl Morgensternile).*$",
277
				"^(Pirita kloostri idaosa arheoloogilised).*$",
278
				"^(Kiri tundmatule).*$",
279
				"^(Kiri Jenaer Allgemeine Literaturzeitung toimetusele).*$",
280
				"^(Eksliibris Nikolai Birukovile).*$",
281
				"^(Eksliibris Nikolai Issakovile).*$",
282
				"^(WHP Cruise Summary Information of section).*$",
283
				"^(Measurement of the top quark\\-pair production cross section with ATLAS in pp collisions at).*$",
284
				"^(Measurement of the spin\\-dependent structure function).*",
285
				"(?i)^.*authors['’′]? reply\\.?$",
286
				"(?i)^.*authors['’′]? response\\.?$"
287
			]
288
		}
289
	}
290
}
291
			</DEDUPLICATION>
292
		</CONFIGURATION>
293
		<STATUS>
294
			<LAST_UPDATE value="2001-12-31T12:00:00"/>
295
		</STATUS>
296
		<SECURITY_PARAMETERS>SECURITY_PARAMETERS</SECURITY_PARAMETERS>
297
	</BODY>
298
</RESOURCE_PROFILE>
modules/dnet-openaireplus-profiles/trunk/src/main/resources/eu/dnetlib/test/profiles/DedupConfigurationDSResources/DedupConfigurationDSResourceType/result.software.xml
1
<RESOURCE_PROFILE>
2
	<HEADER>
3
		<RESOURCE_IDENTIFIER value="c611ec67-eefc-4ffe-a5d4-cb3fc40a8bac_RGVkdXBDb25maWd1cmF0aW9uRFNSZXNvdXJjZXMvRGVkdXBDb25maWd1cmF0aW9uRFNSZXNvdXJjZVR5cGU="/>
4
		<RESOURCE_TYPE value="DedupConfigurationDSResourceType"/>
5
		<RESOURCE_KIND value="DedupConfigurationDSResources"/>
6
		<RESOURCE_URI value=""/>
7
		<DATE_OF_CREATION value="2018-07-25T15:04:07+00:00"/>
8
	</HEADER>
9
	<BODY>
10
		<CONFIGURATION>
11
			<DESCRIPTION>4 - Software: Match against the title, whose numbers must match</DESCRIPTION>
12
			<DEDUPLICATION>
13
{
14
	"wf" : {
15
		"threshold" : "0.99",
16
		"dedupRun" : "001",
17
		"entityType" : "result",
18
		"subEntityType" : "resulttype",
19
		"subEntityValue" : "software",
20
		"orderField" : "title",
21
		"queueMaxSize" : "2000",
22
		"groupMaxSize" : "10",
23
		"slidingWindowSize" : "200",
24
		"rootBuilder" : [ "result", "resultProject_outcome_isProducedBy", "resultResult_publicationDataset_isRelatedTo", "resultResult_similarity_isAmongTopNSimilarDocuments", "resultResult_similarity_hasAmongTopNSimilarDocuments", "resultOrganization_affiliation_isAffiliatedWith" ],
25
		"includeChildren" : "true"
26
	},
27
	"pace" : {
28
		"clustering" : [
29
			{ "name" : "ngrampairs", "fields" : [ "title" ], "params" : { "max" : "1", "ngramLen" : "3"} },
30
			{ "name" : "suffixprefix", "fields" : [ "title" ], "params" : { "max" : "1", "len" : "3" } },
31
			{ "name" : "lowercase", "fields" : [ "doi", "url" ], "params" : { } }
32
		],
33
		"strictConditions" : [
34
			{ "name" : "pidMatch", "fields" : [ "doi" ] },
35
			{ "name" : "exactMatch", "fields" : [ "resulttype", "url" ] }
36
		],
37
		"conditions" : [
38
			{ "name" : "titleVersionMatch", "fields" : [ "title" ] }
39
		],
40
		"model" : [
41
			{ "name" : "doi", "algo" : "Null", "type" : "String", "weight" : "0.0", "ignoreMissing" : "true", "path" : "pid[qualifier#classid = {doi}]/value" },
42
			{ "name" : "title", "algo" : "LevensteinTitle", "type" : "String", "weight" : "1.0", "ignoreMissing" : "false", "path" : "result/metadata/title[qualifier#classid = {main title}]/value" },
43
			{ "name" : "url", "algo" : "Null", "type" : "String", "weight" : "0.0", "ignoreMissing" : "true", "path" : "result/instance/url" },
44
			{ "name" : "resulttype", "algo" : "Null", "type" : "String", "weight" : "0.0", "ignoreMissing" : "false", "path" : "result/metadata/resulttype/classid" }
45
		],
46
		"blacklists" : {
47

  
48
		}
49
	}
50
}
51
			</DEDUPLICATION>
52
		</CONFIGURATION>
53
		<STATUS>
54
			<LAST_UPDATE value="2001-12-31T12:00:00"/>
55
		</STATUS>
56
		<SECURITY_PARAMETERS>SECURITY_PARAMETERS</SECURITY_PARAMETERS>
57
	</BODY>
58
</RESOURCE_PROFILE>
modules/dnet-openaireplus-profiles/trunk/src/main/resources/eu/dnetlib/test/profiles/DedupConfigurationDSResources/DedupConfigurationDSResourceType/result.orp.xml
1
<RESOURCE_PROFILE>
2
	<HEADER>
3
		<RESOURCE_IDENTIFIER value="c611ec67-eefc-4ffe-a5d4-cb3fc40a8bab_RGVkdXBDb25maWd1cmF0aW9uRFNSZXNvdXJjZXMvRGVkdXBDb25maWd1cmF0aW9uRFNSZXNvdXJjZVR5cGU="/>
4
		<RESOURCE_TYPE value="DedupConfigurationDSResourceType"/>
5
		<RESOURCE_KIND value="DedupConfigurationDSResources"/>
6
		<RESOURCE_URI value=""/>
7
		<DATE_OF_CREATION value="2018-07-25T15:04:07+00:00"/>
8
	</HEADER>
9
	<BODY>
10
		<CONFIGURATION>
11
			<DESCRIPTION>3 - Other research product: group by DOI and match against the title, when PIDs are not available</DESCRIPTION>
12
			<DEDUPLICATION>
13
{
14
	"wf" : {
15
		"threshold" : "0.99",
16
		"dedupRun" : "001",
17
		"entityType" : "result",
18
		"subEntityType" : "resulttype",
19
		"subEntityValue" : "other",
20
		"orderField" : "title",
21
		"queueMaxSize" : "2000",
22
		"groupMaxSize" : "10",
23
		"slidingWindowSize" : "200",
24
		"rootBuilder" : [ "result", "resultProject_outcome_isProducedBy", "resultResult_publicationDataset_isRelatedTo", "resultResult_similarity_isAmongTopNSimilarDocuments", "resultResult_similarity_hasAmongTopNSimilarDocuments", "resultOrganization_affiliation_isAffiliatedWith" ],
25
		"includeChildren" : "true"
26
	},
27
	"pace" : {
28
		"clustering" : [
29
			{ "name" : "ngrampairs", "fields" : [ "title" ], "params" : { "max" : "1", "ngramLen" : "3"} },
30
			{ "name" : "suffixprefix", "fields" : [ "title" ], "params" : { "max" : "1", "len" : "3" } },
31
			{ "name" : "lowercase", "fields" : [ "doi" ], "params" : { } }
32
		],
33
		"strictConditions" : [
34
			{ "name" : "pidMatch", "fields" : [ "pid" ] },
35
			{ "name" : "exactMatch", "fields" : [ "resulttype" ] }
36
		],
37
		"conditions" : [
38
			{ "name" : "titleVersionMatch", "fields" : [ "title" ] },
39
			{ "name" : "sizeMatch", "fields" : [ "authors" ] }
40
		],
41
		"model" : [
42
			{ "name" : "doi", "algo" : "Null", "type" : "String", "weight" : "0.0", "ignoreMissing" : "true", "path" : "pid[qualifier#classid = {doi}]/value" },
43
			{ "name" : "pid", "algo" : "Null", "type" : "JSON", "weight" : "0.0", "ignoreMissing" : "true", "path" : "pid", "overrideMatch" : "true" },
44
			{ "name" : "title", "algo" : "LevensteinTitle", "type" : "String", "weight" : "1.0", "ignoreMissing" : "false", "path" : "result/metadata/title[qualifier#classid = {main title}]/value" },
45
			{ "name" : "authors", "algo" : "Null", "type" : "String", "weight" : "0.0", "ignoreMissing" : "true", "path" : "result/metadata/author/fullname" },
46
			{ "name" : "resulttype", "algo" : "Null", "type" : "String", "weight" : "0.0", "ignoreMissing" : "false", "path" : "result/metadata/resulttype/classid" }
47
		],
48
		"blacklists" : {
49

  
50
		}
51
	}
52
}
53
			</DEDUPLICATION>
54
		</CONFIGURATION>
55
		<STATUS>
56
			<LAST_UPDATE value="2001-12-31T12:00:00"/>
57
		</STATUS>
58
		<SECURITY_PARAMETERS>SECURITY_PARAMETERS</SECURITY_PARAMETERS>
59
	</BODY>
60
</RESOURCE_PROFILE>

Also available in: Unified diff