1
|
package eu.dnetlib.iis.importer.converter;
|
2
|
|
3
|
import java.io.IOException;
|
4
|
import java.io.StringReader;
|
5
|
import java.util.ArrayList;
|
6
|
import java.util.Arrays;
|
7
|
import java.util.Collection;
|
8
|
import java.util.HashSet;
|
9
|
import java.util.List;
|
10
|
import java.util.Set;
|
11
|
|
12
|
import javax.xml.parsers.DocumentBuilder;
|
13
|
import javax.xml.parsers.DocumentBuilderFactory;
|
14
|
import javax.xml.parsers.ParserConfigurationException;
|
15
|
import javax.xml.xpath.XPath;
|
16
|
import javax.xml.xpath.XPathExpressionException;
|
17
|
import javax.xml.xpath.XPathFactory;
|
18
|
|
19
|
import org.apache.hadoop.hbase.client.Result;
|
20
|
import org.apache.log4j.Logger;
|
21
|
import org.json.simple.JSONObject;
|
22
|
import org.json.simple.parser.JSONParser;
|
23
|
import org.json.simple.parser.ParseException;
|
24
|
import org.w3c.dom.Document;
|
25
|
import org.xml.sax.InputSource;
|
26
|
import org.xml.sax.SAXException;
|
27
|
|
28
|
import eu.dnetlib.data.proto.FieldTypeProtos.StringField;
|
29
|
import eu.dnetlib.data.proto.OafProtos.Oaf;
|
30
|
import eu.dnetlib.iis.importer.input.approver.ResultApprover;
|
31
|
import eu.dnetlib.iis.importer.schemas.Project;
|
32
|
|
33
|
/**
|
34
|
* HBase {@link Result} to avro {@link Project} converter.
|
35
|
* @author mhorst
|
36
|
*
|
37
|
*/
|
38
|
public class ProjectConverter extends AbstractAvroConverter<Project> {
|
39
|
|
40
|
protected static final Logger log = Logger.getLogger(ProjectConverter.class);
|
41
|
|
42
|
private static final String ELEM_FUNDING_TREE_PARENT = "parent";
|
43
|
private static final String ELEM_FUNDING_TREE_NAME = "name";
|
44
|
|
45
|
private static final String FUNDER_FUNDING_SEPARATOR = "::";
|
46
|
|
47
|
private static final Set<String> ACRONYM_SKIP_LOWERCASED_VALUES = new HashSet<String>(
|
48
|
Arrays.asList("undefined", "unknown"));
|
49
|
|
50
|
/**
|
51
|
* Default constructor.
|
52
|
* @param encoding
|
53
|
* @param resultApprover
|
54
|
*/
|
55
|
public ProjectConverter(String encoding,
|
56
|
ResultApprover resultApprover) {
|
57
|
super(encoding, resultApprover);
|
58
|
}
|
59
|
|
60
|
@Override
|
61
|
public Project buildObject(Result source, Oaf resolvedOafObject) throws IOException {
|
62
|
eu.dnetlib.data.proto.ProjectProtos.Project sourceProject = resolvedOafObject.getEntity()!=null?
|
63
|
resolvedOafObject.getEntity().getProject():null;
|
64
|
if (sourceProject==null) {
|
65
|
log.error("skipping: no project object " +
|
66
|
"for a row " + new String(source.getRow(), getEncoding()));
|
67
|
return null;
|
68
|
}
|
69
|
if (resolvedOafObject.getEntity().getId()!=null &&
|
70
|
!resolvedOafObject.getEntity().getId().isEmpty()) {
|
71
|
Project.Builder builder = Project.newBuilder();
|
72
|
builder.setId(resolvedOafObject.getEntity().getId());
|
73
|
if (sourceProject.getMetadata()!=null) {
|
74
|
if (isAcronymValid(sourceProject.getMetadata().getAcronym())) {
|
75
|
builder.setProjectAcronym(sourceProject.getMetadata().getAcronym().getValue());
|
76
|
}
|
77
|
if (sourceProject.getMetadata().getCode()!=null &&
|
78
|
sourceProject.getMetadata().getCode().getValue()!=null &&
|
79
|
!sourceProject.getMetadata().getCode().getValue().isEmpty()) {
|
80
|
builder.setProjectGrantId(sourceProject.getMetadata().getCode().getValue());
|
81
|
}
|
82
|
String extractedFundingClass = extractFundingClass(
|
83
|
extractStringValues(sourceProject.getMetadata().getFundingtreeList()));
|
84
|
if (extractedFundingClass!=null && !extractedFundingClass.isEmpty()) {
|
85
|
builder.setFundingClass(extractedFundingClass);
|
86
|
}
|
87
|
}
|
88
|
return builder.build();
|
89
|
} else {
|
90
|
log.warn("unable to extract grant number: " +
|
91
|
"unsupported project id: " + resolvedOafObject.getEntity().getId());
|
92
|
return null;
|
93
|
}
|
94
|
}
|
95
|
|
96
|
/**
|
97
|
* Extracts string values from {@link StringField} list.
|
98
|
* @param source
|
99
|
* @return string values extracted from {@link StringField} list
|
100
|
*/
|
101
|
protected static List<String> extractStringValues(List<StringField> source) {
|
102
|
if (source!=null) {
|
103
|
List<String> results = new ArrayList<String>(source.size());
|
104
|
for (StringField currentField : source) {
|
105
|
results.add(currentField.getValue());
|
106
|
}
|
107
|
return results;
|
108
|
} else {
|
109
|
return null;
|
110
|
}
|
111
|
}
|
112
|
|
113
|
/**
|
114
|
* Verifies whether acronym should be considered as valid.
|
115
|
* @param acronym
|
116
|
* @return true if valid, false otherwise
|
117
|
*/
|
118
|
private static boolean isAcronymValid(StringField acronym) {
|
119
|
return acronym!=null && isAcronymValid(acronym.getValue());
|
120
|
}
|
121
|
|
122
|
/**
|
123
|
* Verifies whether acronym should be considered as valid.
|
124
|
* @param acronym
|
125
|
* @return true if valid, false otherwise
|
126
|
*/
|
127
|
public static boolean isAcronymValid(String acronym) {
|
128
|
return acronym!=null && !acronym.isEmpty() &&
|
129
|
!ACRONYM_SKIP_LOWERCASED_VALUES.contains(acronym.trim().toLowerCase());
|
130
|
}
|
131
|
|
132
|
/**
|
133
|
* Extracts funding class from funding tree defined as XML.
|
134
|
* @param fundingTreeXML
|
135
|
* @return extracted funding class
|
136
|
* @throws IOException
|
137
|
*/
|
138
|
public static String extractFundingClassFromXML(Collection<String> fundingTreeXMLList) throws IOException {
|
139
|
if (fundingTreeXMLList!=null && fundingTreeXMLList.size()>0) {
|
140
|
for (String fundingTreeXML : fundingTreeXMLList) {
|
141
|
if (fundingTreeXML!=null && !fundingTreeXML.isEmpty()) {
|
142
|
DocumentBuilderFactory builderFactory = DocumentBuilderFactory.newInstance();
|
143
|
try {
|
144
|
DocumentBuilder builder = builderFactory.newDocumentBuilder();
|
145
|
Document xmlDocument = builder.parse(new InputSource(new StringReader(fundingTreeXML)));
|
146
|
XPath xPath = XPathFactory.newInstance().newXPath();
|
147
|
StringBuilder strBuilder = new StringBuilder();
|
148
|
strBuilder.append(xPath.compile("//funder/shortname").evaluate(xmlDocument));
|
149
|
strBuilder.append(FUNDER_FUNDING_SEPARATOR);
|
150
|
strBuilder.append(xPath.compile("//funding_level_0/name").evaluate(xmlDocument));
|
151
|
return strBuilder.toString();
|
152
|
} catch (ParserConfigurationException e) {
|
153
|
throw new IOException("exception occurred when processing xml: " + fundingTreeXML, e);
|
154
|
} catch (SAXException e) {
|
155
|
throw new IOException("exception occurred when processing xml: " + fundingTreeXML, e);
|
156
|
} catch (XPathExpressionException e) {
|
157
|
throw new IOException("exception occurred when processing xml: " + fundingTreeXML, e);
|
158
|
}
|
159
|
}
|
160
|
}
|
161
|
}
|
162
|
// fallback
|
163
|
return null;
|
164
|
}
|
165
|
|
166
|
/**
|
167
|
* Extracts funding class from funding tree.
|
168
|
* @param fundingTreeList
|
169
|
* @return extracted funding class
|
170
|
* @throws IOException
|
171
|
*/
|
172
|
public static String extractFundingClass(List<String> fundingTreeList) throws IOException {
|
173
|
return extractFundingClassFromJSON(fundingTreeList);
|
174
|
// return extractFundingClassFromXML(fundingTreeList);
|
175
|
}
|
176
|
|
177
|
/**
|
178
|
* Extracts funding class from funding tree defined as JSON.
|
179
|
* @param fundingTreeJsonList
|
180
|
* @return extracted funding class
|
181
|
* @throws IOException
|
182
|
*/
|
183
|
public static String extractFundingClassFromJSON(List<String> fundingTreeJsonList) throws IOException {
|
184
|
if (fundingTreeJsonList!=null && !fundingTreeJsonList.isEmpty()) {
|
185
|
for (String currentFundingTreeJson : fundingTreeJsonList) {
|
186
|
if (currentFundingTreeJson!=null && !currentFundingTreeJson.isEmpty()) {
|
187
|
try {
|
188
|
JSONParser parser = new JSONParser();
|
189
|
JSONObject topLevelParent = getTopLevelParent(
|
190
|
(JSONObject) parser.parse(currentFundingTreeJson));
|
191
|
if (topLevelParent!=null) {
|
192
|
@SuppressWarnings("unchecked")
|
193
|
Collection<JSONObject> topLevelParentValues = topLevelParent.values();
|
194
|
for (JSONObject currentValue : topLevelParentValues) {
|
195
|
Object currentName = currentValue.get(ELEM_FUNDING_TREE_NAME);
|
196
|
if (currentName!=null) {
|
197
|
return currentName.toString();
|
198
|
}
|
199
|
}
|
200
|
// fallback
|
201
|
return null;
|
202
|
} else {
|
203
|
return null;
|
204
|
}
|
205
|
|
206
|
} catch (ParseException e) {
|
207
|
throw new IOException("unable to parse funding tree: " +
|
208
|
currentFundingTreeJson, e);
|
209
|
}
|
210
|
}
|
211
|
}
|
212
|
// fallback
|
213
|
return null;
|
214
|
} else {
|
215
|
return null;
|
216
|
}
|
217
|
}
|
218
|
|
219
|
private static JSONObject getTopLevelParent(JSONObject parent) {
|
220
|
if (parent!=null) {
|
221
|
@SuppressWarnings("unchecked")
|
222
|
Collection<JSONObject> values = parent.values();
|
223
|
for (JSONObject value : values) {
|
224
|
JSONObject newParent = (JSONObject) value.get(ELEM_FUNDING_TREE_PARENT);
|
225
|
if (newParent!=null && !newParent.isEmpty()) {
|
226
|
return getTopLevelParent(newParent);
|
227
|
}
|
228
|
}
|
229
|
// fallback
|
230
|
return parent;
|
231
|
} else {
|
232
|
return null;
|
233
|
}
|
234
|
|
235
|
}
|
236
|
|
237
|
}
|