Project

General

Profile

« Previous | Next » 

Revision 50230

aligned with the configuration on production

View differences:

result.prod.pace.conf
1
{ 
2
	"wf" : { 
3
        "threshold" : "0.99", 
4
        "dedupRun" : "001", 
5
        "entityType" : "result", 
6
        "orderField" : "title", 
7
        "queueMaxSize" : "2000",
8
        "groupMaxSize" : "10",
9
        "slidingWindowSize" : "200",
10
        "rootBuilder" : [ "result", "resultProject_outcome_isProducedBy", "resultResult_publicationDataset_isRelatedTo", "resultResult_similarity_isAmongTopNSimilarDocuments", "resultResult_similarity_hasAmongTopNSimilarDocuments" ],
11
        "includeChildren" : "true" 
12
    },
13
	"pace" : {		
1
{
2
	"wf" : {
3
		"threshold" : "0.99",
4
		"dedupRun" : "001",
5
		"entityType" : "result",
6
		"orderField" : "title",
7
		"queueMaxSize" : "4000",
8
		"groupMaxSize" : "40",
9
		"slidingWindowSize" : "200",
10
		"rootBuilder" : [ "result", "personResult_authorship_hasAuthor", "resultProject_outcome_isProducedBy", "resultResult_publicationDataset_isRelatedTo", "resultResult_similarity_isAmongTopNSimilarDocuments", "resultResult_similarity_hasAmongTopNSimilarDocuments" ],
11
		"includeChildren" : "true",
12
		"maxChildren" : "40"
13
	},
14
	"pace" : {
14 15
		"clustering" : [
15 16
			{ "name" : "ngrampairs", "fields" : [ "title" ], "params" : { "max" : "1", "ngramLen" : "3"} },
16
			{ "name" : "suffixprefix", "fields" : [ "title" ], "params" : { "max" : "1", "len" : "3" } } 
17
			{ "name" : "suffixprefix", "fields" : [ "title" ], "params" : { "max" : "1", "len" : "3" } },
18
			{ "name" : "lowercase", "fields" : [ "doi" ], "params" : { } }
17 19
		],
18 20
		"strictConditions" : [
19
			{ "name" : "exactMatch", "fields" : [ "pid" ] }
20
		], 
21
  		"conditions" : [ 
22
  			{ "name" : "titleVersionMatch", "fields" : [ "title" ] },
23
  			{ "name" : "sizeMatch", "fields" : [ "authors" ] }
24
  		],		
21
			{ "name" : "pidMatch", "fields" : [ "pid" ] }
22
		],
23
		"conditions" : [
24
			{ "name" : "titleVersionMatch", "fields" : [ "title" ] },
25
			{ "name" : "sizeMatch", "fields" : [ "authors" ] }
26
		],
25 27
		"model" : [
26
			{ "name" : "pid", "algo" : "Null", "type" : "String", "weight" : "0.0", "ignoreMissing" : "true", "path" : "pid[qualifier#classid = {doi}]/value", "overrideMatch" : "true" }, 	
28
			{ "name" : "doi", "algo" : "Null", "type" : "String", "weight" : "0.0", "ignoreMissing" : "true", "path" : "pid[qualifier#classid = {doi}]/value" },
29
			{ "name" : "pid", "algo" : "Null", "type" : "JSON", "weight" : "0.0", "ignoreMissing" : "true", "path" : "pid", "overrideMatch" : "true" },
27 30
			{ "name" : "title", "algo" : "LevensteinTitle", "type" : "String", "weight" : "1.0", "ignoreMissing" : "false", "path" : "result/metadata/title[qualifier#classid = {main title}]/value" },
28
			{ "name" : "authors", "algo" : "Null", "type" : "String", "weight" : "0.0", "ignoreMissing" : "true", "path" : "result/metadata/author/fullname" }
31
			{ "name" : "authors", "algo" : "Null", "type" : "String", "weight" : "0.0", "ignoreMissing" : "true", "path" : "result/author/metadata/fullname/value" }
29 32
		],
30
		"blacklists" : { 
33
		"blacklists" : {
31 34
			"title" : [
32 35
				"^THE ASSOCIATION AND THE GENERAL MEDICAL COUNCIL$",
33 36
				"^Problems with perinatal pathology\.?$",
......
38 41
				"^Cartas? ao editor Letters? to the Editor$",
39 42
				"^Note from the Editor$",
40 43
				"^Anesthesia Abstract$",
41
				
44

  
42 45
				"^Annual report$",
43 46
				"(?i)^“?THE RADICAL PREVENTION OF VENEREAL DISEASE\.?”?$",
44 47
				"(?i)^Graph and Table of Infectious Diseases?$",
......
58 61
				"^Cálculo de concentraciones en disoluciones acuosas. Ejercicio interactivo\..*\.$",
59 62
				"(?i)^Genetic and functional analyses of SHANK2 mutations suggest a multiple hit model of Autism spectrum disorders?\.?$",
60 63
				"^Gushi hakubutsugaku$",
61
			
62
				"^Starobosanski nadpisi u Bosni i Hercegovini \(.*\)$",							
64

  
65
				"^Starobosanski nadpisi u Bosni i Hercegovini \(.*\)$",
63 66
				"^Intestinal spirocha?etosis$",
64 67
				"^Treatment of Rodent Ulcer$",
65 68
				"(?i)^\W*Cloud Computing\W*$",
66
				"^Compendio mathematico : en que se contienen todas las materias mas principales de las Ciencias que tratan de la cantidad$",				
69
				"^Compendio mathematico : en que se contienen todas las materias mas principales de las Ciencias que tratan de la cantidad$",
67 70
				"^Free Communications, Poster Presentations: Session [A-F]$",
68
				
71

  
69 72
				"^“The Historical Aspects? of Quackery\.?”$",
70 73
				"^A designated centre for people with disabilities operated by St John of God Community Services (Limited|Ltd), Louth$",
71 74
				"^P(er|re)-Mile Premiums for Auto Insurance\\.?$",
72
				"(?i)^Case Report$",							
75
				"(?i)^Case Report$",
73 76
				"^Boletín Informativo$",
74 77
				"(?i)^Glioblastoma Multiforme$",
75 78
				"(?i)^Nuevos táxones animales descritos en la península Ibérica y Macaronesia desde 1994 \\(.*\\)$",
76 79
				"^Zaměstnanecké výhody$",
77 80
				"(?i)^The Economics of Terrorism and Counter-Terrorism: A Survey \\(Part .*\\)$",
78
				"(?i)^Carotid body tumours?\\.?$", 
81
				"(?i)^Carotid body tumours?\\.?$",
79 82
				"(?i)^\\[Españoles en Francia : La condición Emigrante.*\\]$",
80 83
				"^Avant-propos$",
81 84
				"(?i)^St\. Patrick's Cathedral, Dublin, County Dublin - Head(s)? and Capital(s)?$",
82 85
				"(?i)^St\. Patrick's Cathedral, Dublin, County Dublin - Bases?$",
83
				"(?i)^PUBLIC HEALTH VERSUS THE STATE$",							
86
				"(?i)^PUBLIC HEALTH VERSUS THE STATE$",
84 87
				"^Viñetas de Cortázar$",
85 88
				"(?i)^Search for heavy neutrinos and W(\\[|_|\\(|_\\{|-)?R(\\]|\\)|\\})? bosons with right-handed couplings in a left-right symmetric model in pp collisions at.*TeV(\.)?$",
86
				"(?i)^Measurement of the pseudorapidity and centrality dependence of the transverse energy density in Pb(-?)Pb collisions at.*tev(\.?)$",				
89
				"(?i)^Measurement of the pseudorapidity and centrality dependence of the transverse energy density in Pb(-?)Pb collisions at.*tev(\.?)$",
87 90
				"(?i)^Search for resonances decaying into top-quark pairs using fully hadronic decays in pp collisions with ATLAS at.*TeV$",
88 91
				"(?i)^Search for neutral minimal supersymmetric standard model Higgs bosons decaying to tau pairs in pp collisions at.*tev$",
89
			
92

  
90 93
				"(?i)^Relatório de Estágio (de|em) Angiologia e Cirurgia Vascular$",
91
				"^Aus der AGMB$",				
92
			
94
				"^Aus der AGMB$",
95

  
93 96
				"^Znanstveno-stručni prilozi$",
94
				"^Zhodnocení finanční situace podniku a návrhy na zlepšení$",
95
				"^Evaluation of the Financial Situation in the Firm and Proposals to its Improvement$",
97
				"(?i)^Zhodnocení finanční situace podniku a návrhy na zlepšení$",
98
				"(?i)^Evaluation of the Financial Situation in the Firm and Proposals to its Improvement$",
99
				"(?i)^Hodnocení finanční situace podniku a návrhy na její zlepšení$",
96 100
				"^Finanční analýza podniku$",
97 101
				"^Financial analysis( of business)?$",
98 102
				"(?i)^Textbook of Gyn(a)?(Æ)?(e)?cology$",
99 103
				"^Jikken nihon shūshinsho$",
100 104
				"(?i)^CORONER('|s)(s|') INQUESTS$",
101
				"(?i)^(Μελέτη παραγόντων )?risk management( για ανάπτυξη και εφαρμογή ενός πληροφοριακού συστήματος| και ανάπτυξη συστήματος)?$",				
105
				"(?i)^(Μελέτη παραγόντων )?risk management( για ανάπτυξη και εφαρμογή ενός πληροφοριακού συστήματος| και ανάπτυξη συστήματος)?$",
102 106
				"(?i)^Consultants' contract(s)?$",
103 107
				"(?i)^Upute autorima$",
104 108
				"(?i)^Bijdrage tot de Kennis van den Godsdienst der Dajaks van Lan(d|f)ak en Tajan$",
......
114 118
				"(?i)^cardiac rehabilitation$",
115 119
				"(?i)^Analytical summary$",
116 120
				"^Thesaurus resolutionum Sacrae Congregationis Concilii$",
117
				"(?i)^Sumario analítico(\\s{1})?(Analitic summary)?$", 
121
				"(?i)^Sumario analítico(\\s{1})?(Analitic summary)?$",
118 122
				"^Prikazi i osvrti$",
119 123
				"^Rodinný dům s provozovnou$",
120 124
				"^Family house with an establishment$",
......
125 129
				"(?i)^RUBRIKA UREDNIKA$",
126 130
				"^A Matching Model of the Academic Publication Market$",
127 131
				"^Yōgaku kōyō$",
128
			
132

  
129 133
				"^Internetový marketing$",
130 134
				"^Internet marketing$",
131 135
				"^Chūtō kokugo dokuhon$",
132 136
				"^Kokugo dokuhon$",
133 137
				"^Antibiotic Cover for Dental Extraction(s?)$",
134
				"^Strategie podniku$",				
138
				"^Strategie podniku$",
135 139
				"^Strategy of an Enterprise$",
136 140
				"(?i)^respiratory disease(s?)(\.?)$",
137 141
				"^Award(s?) for Gallantry in Civil Defence$",
......
158 162
				"^Information System Assessment and Proposal for ICT Modification$",
159 163
				"^Stresové zatížení pracovníků ve vybrané profesi$",
160 164
				"^Stress load in a specific job$",
161
				
165

  
162 166
				"^Sunday: Poster Sessions, Pt.*$",
163 167
				"^Monday: Poster Sessions, Pt.*$",
164 168
				"^Wednesday: Poster Sessions, Pt.*",
165 169
				"^Tuesday: Poster Sessions, Pt.*$",
166
				
170

  
167 171
				"^Analýza reklamy$",
168 172
				"^Analysis of advertising$",
169
			
173

  
170 174
				"^Shōgaku shūshinsho$",
171 175
				"^Shōgaku sansū$",
172 176
				"^Shintei joshi kokubun$",
173 177
				"^Taishō joshi kokubun dokuhon$",
174
				"^Joshi kokubun$",				
175
												
178
				"^Joshi kokubun$",
179

  
176 180
				"^Účetní uzávěrka a účetní závěrka v ČR$",
177 181
				"(?i)^The \"?Causes\"? of Cancer$",
178 182
				"^Normas para la publicación de artículos$",
179 183
				"^Editor('|s)(s|') [Rr]eply$",
180 184
				"^Editor(’|s)(s|’) letter$",
181
				"^Redaktoriaus žodis$",		
185
				"^Redaktoriaus žodis$",
182 186
				"^DISCUSSION ON THE PRECEDING PAPER$",
183 187
				"^Kōtō shōgaku shūshinsho jidōyō$",
184 188
				"^Shōgaku nihon rekishi$",
......
190 194
				"^Financial statements in selected company$",
191 195
				"^Abdominal [Aa]ortic [Aa]neurysms.*$",
192 196
				"^Pseudomyxoma peritonei$",
193
				"^Kazalo autora$",			
194
			
197
				"^Kazalo autora$",
198

  
195 199
				"(?i)^uvodna riječ$",
196 200
				"^Motivace jako způsob vedení lidí$",
197 201
				"^Motivation as a leadership$",
198 202
				"^Polyfunkční dům$",
199 203
				"^Multi\\-funkcional building$",
200 204
				"^Podnikatelský plán$",
201
				"^Business Plan$",
205
				"(?i)^Podnikatelský záměr$",
206
				"(?i)^Business Plan$",
202 207
				"^Oceňování nemovitostí$",
203 208
				"^Marketingová komunikace$",
204 209
				"^Marketing communication$",
......
244 249
				"^(\\[MADRID\\. BIBL\\. NAC\\. N.*KING FERDINAND I.*FROM SAN ISIDORO DE LEON\\. FACUNDUS SCRIPSIT DATED.*\\]).*",
245 250
				"^Editorial( Board)?$",
246 251
				"(?i)^Editorial \\(English\\)$",
247
				"^Editörden$",			
252
				"^Editörden$",
248 253
				"^(Corpus Oral Dialectal \\(COD\\)\\.).*$",
249 254
				"^(Kiri Karl Morgensternile).*$",
250 255
				"^(\\[Eksliibris Aleksandr).*\\]$",
......
259 264
				"^(Eksliibris Nikolai Issakovile).*$",
260 265
				"^(WHP Cruise Summary Information of section).*$",
261 266
				"^(Measurement of the top quark\\-pair production cross section with ATLAS in pp collisions at).*$",
262
				"^(Measurement of the spin\\-dependent structure function).*"
267
				"^(Measurement of the spin\\-dependent structure function).*",
268
				"(?i)^.*authors['’′]? reply\.?$",
269
				"(?i)^.*authors['’′]? response\.?$"
263 270
			]
264
		} 		
271
		}
265 272
	}
266 273
}

Also available in: Unified diff