Project

General

Profile

1 26600 sandro.lab
package eu.dnetlib.data.mapreduce.util;
2
3
import static eu.dnetlib.miscutils.collections.MappedCollection.listMap;
4
5
import java.io.StringReader;
6
import java.io.StringWriter;
7
import java.util.List;
8
import java.util.Map;
9
import java.util.Map.Entry;
10
import java.util.Set;
11
import java.util.StringTokenizer;
12
13
import javax.xml.transform.OutputKeys;
14
import javax.xml.transform.Transformer;
15
import javax.xml.transform.TransformerConfigurationException;
16
import javax.xml.transform.TransformerException;
17
import javax.xml.transform.TransformerFactory;
18
import javax.xml.transform.TransformerFactoryConfigurationError;
19
import javax.xml.transform.dom.DOMSource;
20
import javax.xml.transform.stream.StreamResult;
21
22
import org.apache.commons.lang.StringUtils;
23
import org.dom4j.Document;
24
import org.dom4j.DocumentException;
25
import org.dom4j.Element;
26
import org.dom4j.io.SAXReader;
27
import org.json.JSONException;
28
import org.json.JSONObject;
29
30
import com.google.common.base.Predicate;
31
import com.google.common.collect.Iterators;
32
import com.google.common.collect.Lists;
33
import com.google.common.collect.Maps;
34
import com.google.common.collect.Sets;
35
import com.google.protobuf.Descriptors.EnumValueDescriptor;
36
import com.google.protobuf.Descriptors.FieldDescriptor;
37
import com.google.protobuf.GeneratedMessage;
38
import com.mycila.xmltool.XMLDoc;
39
import com.mycila.xmltool.XMLTag;
40
41
import eu.dnetlib.data.mapreduce.hbase.index.config.ContextDef;
42
import eu.dnetlib.data.mapreduce.hbase.index.config.ContextMapper;
43
import eu.dnetlib.data.mapreduce.hbase.index.config.EntityConfigTable;
44 28226 claudio.at
import eu.dnetlib.data.mapreduce.hbase.index.config.LinkDescriptor;
45 28094 claudio.at
import eu.dnetlib.data.mapreduce.hbase.index.config.RelClasses;
46
import eu.dnetlib.data.proto.FieldTypeProtos.DataInfo;
47
import eu.dnetlib.data.proto.FieldTypeProtos.ExtraInfo;
48
import eu.dnetlib.data.proto.FieldTypeProtos.KeyValue;
49
import eu.dnetlib.data.proto.FieldTypeProtos.Qualifier;
50
import eu.dnetlib.data.proto.FieldTypeProtos.StringField;
51
import eu.dnetlib.data.proto.FieldTypeProtos.StructuredProperty;
52 26600 sandro.lab
import eu.dnetlib.data.proto.OafProtos.OafEntity;
53
import eu.dnetlib.data.proto.OafProtos.OafRel;
54
import eu.dnetlib.data.proto.PersonProtos.Person;
55
import eu.dnetlib.data.proto.ProjectProtos.Project;
56
import eu.dnetlib.data.proto.RelMetadataProtos.RelMetadata;
57
import eu.dnetlib.data.proto.ResultProtos.Result;
58
import eu.dnetlib.data.proto.ResultProtos.Result.Context;
59
import eu.dnetlib.data.proto.ResultProtos.Result.ExternalReference;
60
import eu.dnetlib.data.proto.ResultProtos.Result.Instance;
61
import eu.dnetlib.data.proto.ResultProtos.Result.Journal;
62
import eu.dnetlib.data.proto.TypeProtos.Type;
63
import eu.dnetlib.miscutils.functional.UnaryFunction;
64
65
public class XmlRecordFactory {
66
67
	protected Set<String> specialDatasourceTypes = Sets.newHashSet("scholarcomminfra", "infospace", "pubsrepository::mock", "entityregistry");
68
69
	protected TemplateFactory templateFactory = new TemplateFactory();
70
71
	protected OafDecoder mainEntity = null;
72
73
	protected String key = null;
74
75 28226 claudio.at
	protected List<OafDecoder> relations = Lists.newLinkedList();
76
	protected List<OafDecoder> children = Lists.newLinkedList();
77 26600 sandro.lab
78
	protected EntityConfigTable entityConfigTable;
79
80
	protected ContextMapper contextMapper;
81
82 28094 claudio.at
	protected RelClasses relClasses;
83
84 26600 sandro.lab
	protected String schemaLocation;
85
86
	protected boolean entityDefaults;
87
	protected boolean relDefaults;
88
	protected boolean childDefaults;
89
90
	protected Set<String> contextes = Sets.newHashSet();
91
92 28094 claudio.at
	protected List<String> extraInfo = Lists.newArrayList();
93 26600 sandro.lab
94 30968 claudio.at
	protected Map<String, Integer> counters = Maps.newHashMap();
95
96 26600 sandro.lab
	protected Transformer transformer;
97
98 28226 claudio.at
	public XmlRecordFactory(final EntityConfigTable entityConfigTable, final ContextMapper contextMapper, final RelClasses relClasses,
99
			final String schemaLocation, final boolean entityDefaults, final boolean relDefaults, final boolean childDefeaults)
100 33382 claudio.at
			throws TransformerConfigurationException, TransformerFactoryConfigurationError {
101 26600 sandro.lab
		this.entityConfigTable = entityConfigTable;
102
		this.contextMapper = contextMapper;
103 28094 claudio.at
		this.relClasses = relClasses;
104 26600 sandro.lab
		this.schemaLocation = schemaLocation;
105
		this.entityDefaults = entityDefaults;
106
		this.relDefaults = relDefaults;
107
		this.childDefaults = childDefeaults;
108
109
		transformer = TransformerFactory.newInstance().newTransformer();
110
		transformer.setOutputProperty(OutputKeys.OMIT_XML_DECLARATION, "yes");
111
	}
112
113
	public String getId() {
114
		return key;
115
	}
116
117
	public boolean isValid() {
118
		return mainEntity != null;
119
	}
120
121
	public void setMainEntity(final OafDecoder mainEntity) {
122
		this.mainEntity = mainEntity;
123
		this.key = mainEntity.decodeEntity().getId();
124
	}
125
126
	public void addRelation(final OafDecoder rel) {
127
		addRelOrChild(relations, rel);
128
	}
129
130
	public void addChild(final OafDecoder child) {
131
		addRelOrChild(children, child);
132
	}
133
134 28226 claudio.at
	private void addRelOrChild(final List<OafDecoder> list, final OafDecoder decoder) {
135
		list.add(decoder);
136 26600 sandro.lab
	}
137
138
	public String build() {
139
140
		OafEntityDecoder entity = mainEntity.decodeEntity();
141
		// System.out.println("building");
142
		// System.out.println("main: " + mainEntity);
143
		// System.out.println("rel:  " + relations);
144
		// System.out.println("chi:  " + children);
145
		// System.out.println("=============");
146
147
		final Type type = entity.getType();
148
		final List<String> metadata = decodeType(entity, null, entityDefaults, false);
149 28226 claudio.at
150
		// rels has to be processed before the contexts because they enrich the contextMap with the funding info.
151
		List<String> rels = listRelations();
152 26600 sandro.lab
		metadata.addAll(buildContexts(type));
153
		metadata.add(parseDataInfo(mainEntity));
154
155 31186 alessia.ba
		final String body = templateFactory.buildBody(type, metadata, rels, listChildren(), extraInfo);
156 28226 claudio.at
157 26600 sandro.lab
		// System.out.println("record id: " + recordId);
158 31186 alessia.ba
		return templateFactory.buildRecord(type, key, entity.getDateOfCollection(), schemaLocation, body, countersAsXml());
159 26600 sandro.lab
	}
160
161
	private String parseDataInfo(final OafDecoder decoder) {
162
		DataInfo dataInfo = decoder.getOaf().getDataInfo();
163
164
		StringBuilder sb = new StringBuilder();
165
		sb.append("<datainfo>");
166 28094 claudio.at
		sb.append(asXmlElement("inferred", dataInfo.getInferred() + "", null, null));
167
		sb.append(asXmlElement("deletedbyinference", dataInfo.getDeletedbyinference() + "", null, null));
168
		sb.append(asXmlElement("trust", dataInfo.getTrust() + "", null, null));
169
		sb.append(asXmlElement("inferenceprovenance", dataInfo.getInferenceprovenance() + "", null, null));
170
		sb.append(asXmlElement("provenanceaction", null, dataInfo.getProvenanceaction(), null));
171 26600 sandro.lab
		sb.append("</datainfo>");
172
173
		return sb.toString();
174
	}
175
176
	private List<String> decodeType(final OafEntityDecoder decoder, final Set<String> filter, final boolean defaults, final boolean expandingRel) {
177
178
		final List<String> metadata = Lists.newArrayList();
179
		metadata.addAll(listFields(decoder.getMetadata(), filter, defaults, expandingRel));
180
		metadata.addAll(listFields(decoder.getOafEntity(), filter, defaults, expandingRel));
181
182 33382 claudio.at
		if (decoder.getEntity() instanceof Result && !expandingRel) {
183 28094 claudio.at
			metadata.add(asXmlElement("bestlicense", "", getBestLicense(), null));
184 26600 sandro.lab
185
			metadata.addAll(listFields(decoder.getEntity(), filter, defaults, expandingRel));
186
		}
187 33382 claudio.at
		if (decoder.getEntity() instanceof Person && !expandingRel) {
188 26600 sandro.lab
			metadata.addAll(listFields(decoder.getEntity(), filter, defaults, expandingRel));
189
		}
190 33382 claudio.at
		if (decoder.getEntity() instanceof Project && !expandingRel) {
191 26600 sandro.lab
			metadata.addAll(listFields(decoder.getEntity(), filter, defaults, expandingRel));
192
		}
193
194
		return metadata;
195
	}
196
197
	private Qualifier getBestLicense() {
198 31409 claudio.at
		Qualifier bestLicense = getQualifier("UNKNOWN", "not available", "dnet:access_modes");
199 26600 sandro.lab
		LicenseComparator lc = new LicenseComparator();
200
		for (Instance instance : ((Result) mainEntity.decodeEntity().getEntity()).getInstanceList()) {
201
			if (lc.compare(bestLicense, instance.getLicence()) > 0) {
202
				bestLicense = instance.getLicence();
203
			}
204
		}
205
		return bestLicense;
206
	}
207
208 31409 claudio.at
	public Qualifier getQualifier(final String classid, final String classname, final String schemename) {
209
		return Qualifier.newBuilder().setClassid(classid).setClassname(classname).setSchemeid(schemename).setSchemename(schemename).build();
210
	}
211
212 26600 sandro.lab
	private List<String> listRelations() {
213
214
		final List<String> rels = Lists.newArrayList();
215
216 28226 claudio.at
		for (OafDecoder decoder : this.relations) {
217 26600 sandro.lab
218
			final OafRel rel = decoder.getOafRel();
219
			final OafEntity cachedTarget = rel.getCachedTarget();
220
			final OafRelDecoder relDecoder = OafRelDecoder.decode(rel);
221
222 28094 claudio.at
			// if (!relDecoder.getRelType().equals(RelType.personResult) || relDecoder.getRelTargetId().equals(key)) {
223
			if (relDecoder.getRelSourceId().equals(key) || relDecoder.getRelTargetId().equals(key)) {
224 26600 sandro.lab
225
				final List<String> metadata = Lists.newArrayList();
226 28094 claudio.at
				Type targetType = relDecoder.getTargetType(mainEntity.getEntity().getType());
227
				Set<String> relFilter = entityConfigTable.getFilter(targetType, relDecoder.getRelDescriptor());
228
				metadata.addAll(listFields(relDecoder.getSubRel(), relFilter, false, true));
229 26600 sandro.lab
230
				String semanticclass = "";
231
				String semanticscheme = "";
232
233 28226 claudio.at
				RelDescriptor relDescriptor = relDecoder.getRelDescriptor();
234
235 33382 claudio.at
				if (cachedTarget != null && cachedTarget.isInitialized()) {
236 28226 claudio.at
237 28094 claudio.at
					final Set<String> filter = entityConfigTable.getFilter(targetType, relDescriptor);
238 26600 sandro.lab
					metadata.addAll(decodeType(OafEntityDecoder.decode(cachedTarget), filter, relDefaults, true));
239 28094 claudio.at
				}
240 26600 sandro.lab
241 28094 claudio.at
				RelMetadata relMetadata = relDecoder.getRelMetadata();
242
				// debug
243
				if (relMetadata == null) {
244
					// System.err.println(this);
245
					semanticclass = semanticscheme = "UNKNOWN";
246 26600 sandro.lab
				} else {
247 28094 claudio.at
					semanticclass = relClasses.getInverse(relMetadata.getSemantics().getClassname());
248
					semanticscheme = relMetadata.getSemantics().getSchemename();
249 26600 sandro.lab
				}
250
251 30968 claudio.at
				incrementCounter(relDescriptor.getSubRelType().toString());
252 28226 claudio.at
253
				LinkDescriptor ld = entityConfigTable.getDescriptor(relDecoder.getTargetType(mainEntity.getEntity().getType()), relDescriptor);
254
255 33382 claudio.at
				String relId = ld != null && !ld.isSymmetric() ? relDecoder.getRelTargetId() : relDecoder.getRelSourceId();
256 28226 claudio.at
257 30968 claudio.at
				DataInfo info = decoder.getOaf().getDataInfo();
258
259 28226 claudio.at
				rels.add(templateFactory.getRel(targetType, relId, metadata, semanticclass, semanticscheme, info.getInferred(), info.getTrust(),
260
						info.getInferenceprovenance(), info.getProvenanceaction().getClassid()));
261 26600 sandro.lab
			}
262
		}
263
		return rels;
264
	}
265
266
	private List<String> listChildren() {
267
268
		final List<String> children = Lists.newArrayList();
269 28226 claudio.at
		for (OafDecoder decoder : this.children) {
270 30968 claudio.at
			OafEntity cachedTarget = decoder.getOafRel().getCachedTarget();
271
			addChildren(children, cachedTarget, decoder.getRelDescriptor());
272 26600 sandro.lab
		}
273
		OafEntityDecoder entity = mainEntity.decodeEntity();
274
		if (entity.getType().equals(Type.result)) {
275
			for (Instance instance : ((Result) entity.getEntity()).getInstanceList()) {
276
				children.add(templateFactory.getInstance(instance.getHostedby().getKey(), listFields(instance, null, false, false),
277
						listMap(instance.getUrlList(), new UnaryFunction<String, String>() {
278
279
							@Override
280
							public String evaluate(final String identifier) {
281
								return templateFactory.getWebResource(identifier);
282
							}
283
						})));
284
			}
285
			for (ExternalReference er : ((Result) entity.getEntity()).getExternalReferenceList()) {
286
				// Set<String> filters = entityConfigTable.getFilter(Type.result, RelType.resultResult);
287
				List<String> fields = listFields(er, null, false, false);
288
				children.add(templateFactory.getChild("externalreference", null, fields));
289
			}
290
		}
291
292
		return children;
293
	}
294
295 28094 claudio.at
	private void addChildren(final List<String> children, final OafEntity target, final RelDescriptor relDescriptor) {
296 26600 sandro.lab
		final OafEntityDecoder decoder = OafEntityDecoder.decode(target);
297 30968 claudio.at
		incrementCounter(relDescriptor.getSubRelType().toString());
298 28094 claudio.at
		Set<String> filters = entityConfigTable.getFilter(target.getType(), relDescriptor);
299 26600 sandro.lab
		children.add(templateFactory.getChild(decoder.getType().toString(), decoder.getId(), listFields(decoder.getMetadata(), filters, childDefaults, false)));
300
	}
301
302
	// //////////////////////////////////
303
304
	private List<String> listFields(final GeneratedMessage fields, final Set<String> filter, final boolean defaults, final boolean expandingRel) {
305
306
		final List<String> metadata = Lists.newArrayList();
307
308
		if (fields != null) {
309
310
			Set<String> seen = Sets.newHashSet();
311
			for (Entry<FieldDescriptor, Object> e : filterFields(fields, filter)) {
312
313
				// final String name = getFieldName(e.getKey().getName());
314
				final String name = e.getKey().getName();
315
				seen.add(name);
316
317
				addFieldValue(metadata, e.getKey(), e.getValue(), expandingRel);
318
			}
319
320
			if (defaults) {
321
				for (FieldDescriptor fd : fields.getDescriptorForType().getFields()) {
322
					if (!seen.contains(fd.getName())) {
323
						addFieldValue(metadata, fd, getDefault(fd), expandingRel);
324
					}
325
				}
326
			}
327
		}
328
		return metadata;
329
	}
330
331
	private Object getDefault(final FieldDescriptor fd) {
332
		switch (fd.getType()) {
333
		case BOOL:
334
			return false;
335
		case BYTES:
336
			return "".getBytes();
337
		case MESSAGE: {
338 33382 claudio.at
			if (Qualifier.getDescriptor().equals(fd.getMessageType())) { return defaultQualifier(); }
339
			if (StructuredProperty.getDescriptor().equals(fd.getMessageType())) { return StructuredProperty.newBuilder().setValue("")
340
					.setQualifier(defaultQualifier()).build(); }
341
			if (KeyValue.getDescriptor().equals(fd.getMessageType())) { return KeyValue.newBuilder().setKey("").setValue("").build(); }
342
			if (StringField.getDescriptor().equals(fd.getMessageType())) { return StringField.newBuilder().setValue("").build(); }
343 26600 sandro.lab
			return null;
344
		}
345
		case SFIXED32:
346
		case SFIXED64:
347
		case SINT32:
348
		case SINT64:
349
		case INT32:
350
		case INT64:
351
		case DOUBLE:
352
		case FIXED32:
353
		case FIXED64:
354
		case FLOAT:
355
			return 0;
356
		case STRING:
357
			return "";
358
		default:
359
			return null;
360
		}
361
	}
362
363
	private Qualifier defaultQualifier() {
364
		return Qualifier.newBuilder().setClassid("").setClassname("").setSchemeid("").setSchemename("").build();
365
	}
366
367
	@SuppressWarnings("unchecked")
368
	private void addFieldValue(final List<String> metadata, final FieldDescriptor fd, final Object value, final boolean expandingRel) {
369 33382 claudio.at
		if (fd.getName().equals("dateofcollection") || fd.getName().equals("id") || fd.getName().equals("url") || value == null) { return; }
370 26600 sandro.lab
371
		if (fd.getName().equals("datasourcetype")) {
372
			String classid = ((Qualifier) value).getClassid();
373
374
			Qualifier.Builder q = Qualifier.newBuilder((Qualifier) value);
375
			if (specialDatasourceTypes.contains(classid)) {
376
				q.setClassid("other").setClassname("other");
377
			}
378 28094 claudio.at
			metadata.add(asXmlElement("datasourcetypeui", "", q.build(), null));
379 26600 sandro.lab
		}
380
381 33382 claudio.at
		if (fd.isRepeated() && value instanceof List<?>) {
382 26600 sandro.lab
			for (Object o : (List<Object>) value) {
383
				guessType(metadata, fd, o, expandingRel);
384
			}
385
		} else {
386
			guessType(metadata, fd, value, expandingRel);
387
		}
388
	}
389
390
	private void guessType(final List<String> metadata, final FieldDescriptor fd, final Object o, final boolean expandingRel) {
391
392
		if (fd.getType().equals(FieldDescriptor.Type.MESSAGE)) {
393
394
			if (Qualifier.getDescriptor().equals(fd.getMessageType())) {
395
				Qualifier qualifier = (Qualifier) o;
396 28094 claudio.at
				metadata.add(asXmlElement(fd.getName(), "", qualifier, null));
397 26600 sandro.lab
			}
398
399
			if (StructuredProperty.getDescriptor().equals(fd.getMessageType())) {
400
				StructuredProperty sp = (StructuredProperty) o;
401 28094 claudio.at
				metadata.add(asXmlElement(fd.getName(), sp.getValue(), sp.getQualifier(), sp.hasDataInfo() ? sp.getDataInfo() : null));
402 26600 sandro.lab
			}
403
404
			if (KeyValue.getDescriptor().equals(fd.getMessageType())) {
405
				KeyValue kv = (KeyValue) o;
406
				metadata.add("<" + fd.getName() + " name=\"" + escapeXml(kv.getValue()) + "\" id=\"" + escapeXml(removePrefix(kv.getKey())) + "\"/>");
407
			}
408
409 28094 claudio.at
			if (StringField.getDescriptor().equals(fd.getMessageType())) {
410
				if (fd.getName().contains("fundingtree")) {
411
					handleFundingTree(metadata, fd, o, expandingRel);
412
				} else {
413
					StringField sf = (StringField) o;
414
					StringBuilder sb = new StringBuilder("<" + fd.getName());
415
					if (sf.hasDataInfo()) {
416
						DataInfo dataInfo = sf.getDataInfo();
417
						dataInfoAsAttributes(sb, dataInfo);
418
					}
419
					sb.append(">" + escapeXml(sf.getValue()) + "</" + fd.getName() + ">");
420
					metadata.add(sb.toString());
421
				}
422
			}
423
424 33382 claudio.at
			if (Journal.getDescriptor().equals(fd.getMessageType()) && o != null) {
425 26600 sandro.lab
				Journal j = (Journal) o;
426
				metadata.add("<journal " + "issn=\"" + escapeXml(j.getIssnPrinted()) + "\" " + "eissn=\"" + escapeXml(j.getIssnOnline()) + "\" " + "lissn=\""
427
						+ escapeXml(j.getIssnLinking()) + "\">" + escapeXml(j.getName()) + "</journal>");
428
			}
429
430 33382 claudio.at
			if (Context.getDescriptor().equals(fd.getMessageType()) && o != null) {
431 26600 sandro.lab
				contextes.add(((Result.Context) o).getId());
432
			}
433
434 33382 claudio.at
			if (ExtraInfo.getDescriptor().equals(fd.getMessageType()) && o != null) {
435 26600 sandro.lab
436 28094 claudio.at
				ExtraInfo e = (ExtraInfo) o;
437
				StringBuilder sb = new StringBuilder("<" + fd.getName() + " ");
438
439
				sb.append("name=\"" + e.getName() + "\" ");
440
				sb.append("typology=\"" + e.getTypology() + "\" ");
441
				sb.append("provenance=\"" + e.getProvenance() + "\" ");
442
				sb.append("trust=\"" + e.getTrust() + "\"");
443
				sb.append(">");
444 30827 claudio.at
				sb.append(e.getValue());
445 26600 sandro.lab
				sb.append("</" + fd.getName() + ">");
446 28094 claudio.at
447
				extraInfo.add(sb.toString());
448 26600 sandro.lab
			}
449
450
		} else if (fd.getType().equals(FieldDescriptor.Type.ENUM)) {
451 33382 claudio.at
			if (fd.getFullName().equals("eu.dnetlib.data.proto.OafEntity.type")) { return; }
452 28094 claudio.at
			metadata.add(asXmlElement(fd.getName(), ((EnumValueDescriptor) o).getName(), null, null));
453 26600 sandro.lab
		} else {
454 28094 claudio.at
			metadata.add(asXmlElement(fd.getName(), o.toString(), null, null));
455 26600 sandro.lab
		}
456
	}
457
458 28226 claudio.at
	private StringBuilder dataInfoAsAttributes(final StringBuilder sb, final DataInfo dataInfo) {
459 28094 claudio.at
		sb.append(" inferred=\"" + dataInfo.getInferred() + "\"");
460
		sb.append(" inferenceprovenance=\"" + dataInfo.getInferenceprovenance() + "\"");
461 30863 claudio.at
		sb.append(" provenanceaction=\"" + dataInfo.getProvenanceaction().getClassid() + "\"");
462 28094 claudio.at
		sb.append(" trust=\"" + dataInfo.getTrust() + "\" ");
463
		return sb;
464 26600 sandro.lab
	}
465
466
	private List<String> buildContexts(final Type type) {
467
		final List<String> res = Lists.newArrayList();
468
469 33382 claudio.at
		if (contextMapper != null && !contextMapper.isEmpty() && type.equals(Type.result)) {
470 26600 sandro.lab
471
			XMLTag document = XMLDoc.newDocument(true).addRoot("contextRoot");
472
473
			for (String id : contextes) {
474
475
				StringTokenizer st = new StringTokenizer(id, "::");
476
				String token = "";
477
				while (st.hasMoreTokens()) {
478
					token += st.nextToken();
479
480
					final ContextDef def = contextMapper.get(token);
481
482 33382 claudio.at
					if (def == null) { throw new IllegalStateException("cannot find context for id: " + token); }
483 26600 sandro.lab
484
					if (def.getName().equals("context")) {
485
						String xpath = "//context/@id='" + def.getId() + "'";
486
						if (!document.gotoRoot().rawXpathBoolean(xpath, new Object())) {
487
							document = addContextDef(document.gotoRoot(), def);
488
						}
489
					}
490
491
					if (def.getName().equals("category")) {
492
						String rootId = StringUtils.substringBefore(def.getId(), "::");
493
						document = addContextDef(document.gotoRoot().gotoTag("//context[./@id='" + rootId + "']", new Object()), def);
494
					}
495
496
					if (def.getName().equals("concept")) {
497
						document = addContextDef(document, def).gotoParent();
498
					}
499
					token += "::";
500
				}
501
			}
502
503
			for (org.w3c.dom.Element x : document.gotoRoot().getChildElement()) {
504
				try {
505
					res.add(asStringElement(x));
506
				} catch (TransformerException e) {
507
					throw new RuntimeException(e);
508
				}
509
			}
510
		}
511
512
		return res;
513
	}
514
515
	private XMLTag addContextDef(final XMLTag tag, final ContextDef def) {
516
		tag.addTag(def.getName()).addAttribute("id", def.getId()).addAttribute("label", def.getLabel());
517 33382 claudio.at
		if (def.getType() != null && !def.getType().isEmpty()) {
518 26600 sandro.lab
			tag.addAttribute("type", def.getType());
519
		}
520
		return tag;
521
	}
522
523
	private String asStringElement(final org.w3c.dom.Element element) throws TransformerException {
524
		StringWriter buffer = new StringWriter();
525
		transformer.transform(new DOMSource(element), new StreamResult(buffer));
526
		return buffer.toString();
527
	}
528
529
	@SuppressWarnings("unchecked")
530
	private void handleFundingTree(final List<String> metadata, final FieldDescriptor fd, final Object o, final boolean expandingRel) {
531 28094 claudio.at
		String xmlTree = asXmlJSon(fd.getName(), o instanceof StringField ? ((StringField) o).getValue() : o.toString());
532 26600 sandro.lab
		if (expandingRel) {
533
			try {
534
				Document ftree = new SAXReader().read(new StringReader(xmlTree));
535
536
				int i = 0;
537
				String funding = "<funding>";
538
				String _id = "";
539
540
				for (Object id : Lists.reverse(ftree.selectNodes("//fundingtree//name"))) {
541
					_id += ((Element) id).getText();
542
					funding += "<funding_level_" + i + ">" + escapeXml(_id) + "</funding_level_" + i + ">";
543
					_id += "::";
544
					i++;
545
				}
546
				funding += "</funding>";
547
				// System.out.println("-------------------------------\n" + xmlTree + "\n" + funding);
548
				metadata.add(funding);
549
			} catch (DocumentException e) {
550
				System.err.println("unable to parse funding tree: " + xmlTree + "\n" + e.getMessage());
551
			}
552
		} else {
553
			metadata.add(xmlTree);
554
		}
555
	}
556
557
	private String asXmlJSon(final String root, final String json) {
558
		try {
559 33382 claudio.at
			if (json == null || json.isEmpty()) { return "<" + root + "/>"; }
560 26600 sandro.lab
			JSONObject o = new JSONObject(json.replace("'", ""));
561
562
			String contextId = parseFundingJson(o).toLowerCase();
563
			contextes.add(contextId);
564
565
			String xml = org.json.XML.toString(o, root);
566
			return xml;
567
		} catch (Exception e) {
568
			System.err.println("unable to parse json: " + json + "\n" + e.getMessage());
569
			return "<" + root + "/>";
570
		}
571
	}
572
573
	private String parseFundingJson(final JSONObject o) {
574
		try {
575
			String key = (String) Iterators.getOnlyElement(o.keys());
576
			JSONObject obj = o.getJSONObject(key);
577
578
			String id = obj.getString("id").toLowerCase();
579
			if (id.startsWith("welcometrust::")) {
580
				id = StringUtils.substringBeforeLast("uk::" + id.replace("welcometrust", "wt"), "::") + "::" + cleanup(id);
581
			} else if (id.startsWith("wt::wt")) {
582
				id = StringUtils.substringBeforeLast(id.replaceFirst("wt", "uk"), "::") + "::" + cleanup(id);
583
			} else if (id.startsWith("corda_______::")) {
584
				id = id.replace("corda_______::", "ec::");
585 33382 claudio.at
			} else if (id.startsWith("fct_________::")) {
586
				id = "pt::" + id.replace("fct_________", "fct");
587
				if (id.endsWith("::fct")) {
588
					id = StringUtils.substringBeforeLast(id, "::fct");
589
				}
590 26600 sandro.lab
			}
591
592
			String label = obj.getString("name");
593
594
			if (key.endsWith("level_0")) {
595
596
				if (id.equals("uk::wt")) {
597
					label = "Wellcome Trust Funding Stream";
598
				}
599
				contextMapper.put(id, new ContextDef(id, label, "category", ""));
600
601
				if (id.startsWith("ec::")) {
602
					contextMapper.put("ec", new ContextDef("ec", "European Community", "context", "funding"));
603
				} else if (id.startsWith("uk::")) {
604
					contextMapper.put("uk", new ContextDef("uk", "United Kingdom", "context", "funding"));
605 33382 claudio.at
				} else if (id.startsWith("pt::")) {
606
					contextMapper.put("pt", new ContextDef("pt", "Portugal", "context", "funding"));
607 26600 sandro.lab
				}
608
			} else {
609
				contextMapper.put(id, new ContextDef(id, label, "concept", ""));
610
				parseFundingJson(obj.getJSONObject("parent"));
611
			}
612
613
			return id;
614
		} catch (JSONException e) {
615
			throw new RuntimeException(e);
616
		}
617
	}
618
619
	private String cleanup(final String id) {
620
		return StringUtils.substring(StringUtils.deleteWhitespace(StringUtils.substringAfterLast(id, "::").replaceAll("[^a-zA-Z]", "")), 0, 20);
621
	}
622
623 28094 claudio.at
	private String asXmlElement(final String name, final String value, final Qualifier q, final DataInfo dataInfo) {
624 26600 sandro.lab
		StringBuilder sb = new StringBuilder();
625
		sb.append("<");
626
		sb.append(name);
627
		if (q != null) {
628
			sb.append(getAttributes(q));
629
		}
630 28094 claudio.at
		if (dataInfo != null) {
631
			sb = dataInfoAsAttributes(sb, dataInfo);
632
		}
633 33382 claudio.at
		if (value == null || value.isEmpty()) {
634 26600 sandro.lab
			sb.append("/>");
635
			return sb.toString();
636
			// return "<" + name + getAttributes(q) + "/>";
637
		}
638
639
		sb.append(">");
640
		// sb.append(escapeXml(Normalizer.normalize(value, Normalizer.Form.NFD)));
641
		sb.append(escapeXml(value));
642
		sb.append("</");
643
		sb.append(name);
644
		sb.append(">");
645
646
		return sb.toString();
647
		// return "<" + name + getAttributes(q) + ">" + escapeXml(value) + "</" + name + ">";
648
	}
649
650
	private String getAttributes(final Qualifier q) {
651 33382 claudio.at
		if (q == null) { return ""; }
652 26600 sandro.lab
653
		StringBuilder sb = new StringBuilder();
654
		for (Entry<FieldDescriptor, Object> e : q.getAllFields().entrySet()) {
655
			// sb.append(" " + e.getKey().getName() + "=\"" + escapeXml(e.getValue().toString()) + "\"");
656
			sb.append(" ");
657
			sb.append(e.getKey().getName());
658
			sb.append("=\"");
659
			sb.append(escapeXml(e.getValue().toString()));
660
			sb.append("\"");
661
		}
662
		return sb.toString();
663
	}
664
665
	private Set<Entry<FieldDescriptor, Object>> filterFields(final GeneratedMessage fields, final Set<String> filter) {
666
667
		if (filter != null) {
668
			Predicate<FieldDescriptor> p = new Predicate<FieldDescriptor>() {
669
670
				@Override
671
				public boolean apply(final FieldDescriptor descriptor) {
672 33382 claudio.at
					if (fields == null) {
673
					return false;
674
					}
675 28094 claudio.at
					String name = descriptor.getName();
676
					return filter.contains(name);
677 26600 sandro.lab
				}
678
			};
679
			Map<FieldDescriptor, Object> filtered = Maps.filterKeys(fields.getAllFields(), p);
680
			// System.out.println(
681
			// "filtered " + type.toString() + ": " + toString(filterEntries.keySet()) + "\n" +
682
			// "builder  " + fields.getDescriptorForType().getFullName() + ": " + toString(fields.getAllFields().keySet()));
683
			return filtered.entrySet();
684
		}
685
		return fields.getAllFields().entrySet();
686
	}
687
688
	public static String removePrefix(final String s) {
689 33382 claudio.at
		if (s.contains("|")) { return StringUtils.substringAfter(s, "|"); }
690 26600 sandro.lab
		return s;
691
	}
692
693
	public static String escapeXml(final String value) {
694
		// return StringEscapeUtils.escapeXml(value).replaceAll("&", "&amp;").replaceAll("<", "&lt;").replaceAll(">", "&gt;");
695
		return value.replaceAll("&", "&amp;").replaceAll("<", "&lt;").replaceAll(">", "&gt;");
696
	}
697
698 30968 claudio.at
	private List<String> countersAsXml() {
699
		List<String> out = Lists.newArrayList();
700
		for (Entry<String, Integer> e : counters.entrySet()) {
701
			out.add(String.format("<counter_%s value=\"%s\"/>", e.getKey(), e.getValue()));
702
		}
703
		return out;
704
	}
705
706
	private void incrementCounter(final String type) {
707
		if (!counters.containsKey(type)) {
708
			counters.put(type, 1);
709
		} else {
710
			counters.put(type, counters.get(type) + 1);
711
		}
712
	}
713
714 26600 sandro.lab
	@Override
715
	public String toString() {
716
		StringBuilder sb = new StringBuilder();
717
		sb.append("################################################\n");
718
		sb.append("ID: ").append(key).append("\n");
719
		if (mainEntity != null) {
720
			sb.append("MAIN ENTITY:\n").append(mainEntity.getEntity().toString() + "\n");
721
		}
722
		if (relations != null) {
723
			sb.append("\nRELATIONS:\n");
724 28226 claudio.at
			for (OafDecoder decoder : relations) {
725 26600 sandro.lab
				sb.append(decoder.getOafRel().toString() + "\n");
726
			}
727
		}
728
		if (children != null) {
729
			sb.append("\nCHILDREN:\n");
730 28226 claudio.at
			for (OafDecoder decoder : children) {
731 26600 sandro.lab
				sb.append(decoder.getOafRel().toString() + "\n");
732
			}
733
		}
734
		return sb.toString();
735
	}
736
737
}