1
|
package eu.dnetlib.data.collective.transformation.engine.functions;
|
2
|
|
3
|
import java.util.HashSet;
|
4
|
import java.util.LinkedList;
|
5
|
import java.util.List;
|
6
|
import java.util.Set;
|
7
|
import java.util.regex.Matcher;
|
8
|
import java.util.regex.Pattern;
|
9
|
|
10
|
import javax.xml.parsers.ParserConfigurationException;
|
11
|
import javax.xml.xpath.XPath;
|
12
|
import javax.xml.xpath.XPathConstants;
|
13
|
import javax.xml.xpath.XPathExpressionException;
|
14
|
|
15
|
import org.apache.commons.logging.Log;
|
16
|
import org.apache.commons.logging.LogFactory;
|
17
|
import org.w3c.dom.Document;
|
18
|
import org.w3c.dom.DocumentFragment;
|
19
|
import org.w3c.dom.Element;
|
20
|
import org.w3c.dom.Node;
|
21
|
import org.w3c.dom.NodeList;
|
22
|
|
23
|
public class IdentifierExtract extends AbstractTransformationFunction{
|
24
|
public static final Log log = LogFactory.getLog(IdentifierExtract.class);
|
25
|
public static final String paramXpathExprJson = "xpathExprJson";
|
26
|
public static final String paramXpathExprInSource = "xpathExprInputSource";
|
27
|
public static final String paramRegExpr = "regExpr";
|
28
|
|
29
|
@Override
|
30
|
String execute() throws ProcessingException {
|
31
|
// TODO Auto-generated method stub
|
32
|
return null;
|
33
|
}
|
34
|
|
35
|
/**
|
36
|
* extract content matched by a regular expression pattern from a given node and return matched content as a node-list
|
37
|
* @param aXpathExprList
|
38
|
* @param aInput
|
39
|
* @param aRegExpression
|
40
|
* @param aDocument
|
41
|
* @param aXpath
|
42
|
* @return nodeList
|
43
|
* @throws ProcessingException
|
44
|
*/
|
45
|
public NodeList extract(List<String> aXpathExprList, Node aInput,
|
46
|
String aRegExpression, Document aDocument, XPath aXpath) throws ProcessingException {
|
47
|
|
48
|
log.debug("xpathExprList: " + aXpathExprList);
|
49
|
log.debug("regExpr: " + aRegExpression);
|
50
|
Set<String> identifierSet = new HashSet<String>();
|
51
|
|
52
|
// log.debug("node: length: " + aInput.getChildNodes().getLength());
|
53
|
log.debug("regular expression : " + aRegExpression);
|
54
|
Pattern p = Pattern.compile(aRegExpression);
|
55
|
try {
|
56
|
List<String> textList = extractText(aXpathExprList, aInput, aXpath);
|
57
|
for (String text: textList){
|
58
|
log.debug("text as input : " + text);
|
59
|
Matcher m = p.matcher(text);
|
60
|
while (m.find()){
|
61
|
log.debug("extracted identifier: " + m.group());
|
62
|
identifierSet.add(m.group());
|
63
|
}
|
64
|
}
|
65
|
return toNodeList(identifierSet, aDocument);
|
66
|
} catch (XPathExpressionException e) {
|
67
|
e.printStackTrace();
|
68
|
throw new ProcessingException(e);
|
69
|
} catch (ParserConfigurationException e) {
|
70
|
e.printStackTrace();
|
71
|
throw new ProcessingException(e);
|
72
|
}
|
73
|
}
|
74
|
|
75
|
/**
|
76
|
* create a list of nodes from a list of string values
|
77
|
* @param aValueSet, set of unique values
|
78
|
* @param aDocument
|
79
|
* @return nodeList
|
80
|
*/
|
81
|
private NodeList toNodeList(Set<String> aValueSet, Document aDocument){
|
82
|
DocumentFragment dFrag = aDocument.createDocumentFragment();
|
83
|
Element root = aDocument.createElement("root");
|
84
|
dFrag.appendChild(root);
|
85
|
for (String value: aValueSet){
|
86
|
Element eVal = aDocument.createElement("value");
|
87
|
eVal.setTextContent(value);
|
88
|
root.appendChild(eVal);
|
89
|
}
|
90
|
return dFrag.getChildNodes();
|
91
|
}
|
92
|
|
93
|
/**
|
94
|
* extract text from a given node using a list of given xpath expressions
|
95
|
* @param aXpathExprList
|
96
|
* @param aInput
|
97
|
* @param aXpath
|
98
|
* @return list of strings
|
99
|
* @throws XPathExpressionException
|
100
|
* @throws ParserConfigurationException
|
101
|
*/
|
102
|
private List<String> extractText(List<String> aXpathExprList, Node aInput, XPath aXpath) throws XPathExpressionException, ParserConfigurationException{
|
103
|
|
104
|
List<String> resultList = new LinkedList<String>();
|
105
|
for (String xpathExpr: aXpathExprList){
|
106
|
NodeList nodeList = (NodeList)aXpath.evaluate(xpathExpr, aInput, XPathConstants.NODESET);
|
107
|
log.debug("extract text: nodelist length: " + nodeList.getLength());
|
108
|
for (int i = 0; i < nodeList.getLength(); i++){
|
109
|
resultList.add(nodeList.item(i).getTextContent());
|
110
|
}
|
111
|
}
|
112
|
return resultList;
|
113
|
}
|
114
|
}
|