1
|
package eu.dnetlib.data.transform;
|
2
|
|
3
|
import java.io.StringReader;
import java.util.Locale;

import org.apache.commons.codec.binary.Base64;
import org.apache.commons.lang.StringUtils;
import org.apache.solr.common.SolrInputDocument;
import org.dom4j.Document;
import org.dom4j.DocumentException;
import org.dom4j.Element;
import org.dom4j.io.SAXReader;

import com.google.common.base.Splitter;
import com.google.common.collect.Lists;
import com.google.protobuf.GeneratedMessage;

import eu.dnetlib.pace.config.Type;
|
18
|
|
19
|
/**
|
20
|
* The Class ProtoDocumentMapper.
|
21
|
*/
|
22
|
public class SolrProtoMapper extends AbstractProtoMapper {
|
23
|
|
24
|
private static final String ID_SEPARATOR = "::";
|
25
|
|
26
|
/** The fields. */
|
27
|
private Document fields;
|
28
|
|
29
|
/**
|
30
|
* Instantiates a new proto document mapper.
|
31
|
*
|
32
|
* @param fields
|
33
|
* the fields
|
34
|
* @throws DocumentException
|
35
|
* the document exception
|
36
|
*/
|
37
|
public SolrProtoMapper(final String fields) throws DocumentException {
|
38
|
this.fields = parse(fields);
|
39
|
|
40
|
if (StringUtils.isBlank(this.fields.valueOf("//FIELD[@name = 'objIdentifier']/@name")))
|
41
|
throw new IllegalArgumentException("field objIdentifier is mandatory");
|
42
|
}
|
43
|
|
44
|
/**
|
45
|
* Map.
|
46
|
*
|
47
|
* @param proto
|
48
|
* the proto
|
49
|
* @param version
|
50
|
* the version
|
51
|
* @param dsId
|
52
|
* the ds id
|
53
|
* @return the solr input document
|
54
|
* @throws DocumentException
|
55
|
* the document exception
|
56
|
*/
|
57
|
public SolrInputDocument map(final GeneratedMessage proto, final String version, final String dsId, final String actionSetId) throws DocumentException {
|
58
|
|
59
|
final SolrInputDocument doc = new SolrInputDocument();
|
60
|
|
61
|
for (final Object o : fields.selectNodes("//FIELD[string(@path)]")) {
|
62
|
final Element e = (Element) o;
|
63
|
|
64
|
final String name = e.attribute("name").getValue().toLowerCase().trim();
|
65
|
final String path = e.attribute("path").getValue();
|
66
|
|
67
|
doc.setField(name, processMultiPath(proto, Lists.newLinkedList(Splitter.on("|").trimResults().split(path)), Type.String));
|
68
|
}
|
69
|
|
70
|
final String objIdentifier = patchId((String) doc.getFieldValue("objidentifier"));
|
71
|
doc.setField("objidentifier", objIdentifier);
|
72
|
doc.setField("__indexrecordidentifier", getRecordId(objIdentifier, actionSetId));
|
73
|
doc.setField("__dsid", dsId);
|
74
|
doc.setField("__dsversion", version);
|
75
|
doc.setField("__result", Base64.encodeBase64String(proto.toByteArray()));
|
76
|
doc.setField("actionset", actionSetId);
|
77
|
|
78
|
return doc;
|
79
|
}
|
80
|
|
81
|
public String getRecordId(final String objIdentifier, final String actionSetId) {
|
82
|
return objIdentifier + ID_SEPARATOR + actionSetId;
|
83
|
}
|
84
|
|
85
|
/**
|
86
|
* Patch the objidentifier: when it comes from HBase, i.e. contains the separator '|' returns the string that follows.
|
87
|
*
|
88
|
* @param objidentifier
|
89
|
* the objidentifier
|
90
|
* @return the string
|
91
|
*/
|
92
|
private String patchId(final String objidentifier) {
|
93
|
return objidentifier.contains("|") ? StringUtils.substringAfter(objidentifier, "|") : objidentifier;
|
94
|
}
|
95
|
|
96
|
/**
|
97
|
* Parses the.
|
98
|
*
|
99
|
* @param s
|
100
|
* the s
|
101
|
* @return the document
|
102
|
* @throws DocumentException
|
103
|
* the document exception
|
104
|
*/
|
105
|
private Document parse(final String s) throws DocumentException {
|
106
|
return new SAXReader().read(new StringReader(s));
|
107
|
}
|
108
|
|
109
|
}
|