Project

General

Profile

1
package eu.dnetlib.data.collective.transformation.engine.functions;
2

    
3
import java.util.HashSet;
import java.util.LinkedHashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import javax.xml.parsers.ParserConfigurationException;
import javax.xml.xpath.XPath;
import javax.xml.xpath.XPathConstants;
import javax.xml.xpath.XPathExpressionException;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.w3c.dom.Document;
import org.w3c.dom.DocumentFragment;
import org.w3c.dom.Element;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
22

    
23
public class IdentifierExtract extends AbstractTransformationFunction{
24
	public static final Log log = LogFactory.getLog(IdentifierExtract.class);
25
	public static final String paramXpathExprJson = "xpathExprJson";
26
	public static final String paramXpathExprInSource = "xpathExprInputSource";
27
	public static final String paramRegExpr = "regExpr";
28
	
29
	@Override
30
	String execute() throws ProcessingException {
31
		// TODO Auto-generated method stub
32
		return null;
33
	}
34

    
35
	/**
36
	 * extract content matched by a regular expression pattern from a given node and return matched content as a node-list
37
	 * @param aXpathExprList
38
	 * @param aInput
39
	 * @param aRegExpression
40
	 * @param aDocument
41
	 * @param aXpath
42
	 * @return nodeList
43
	 * @throws ProcessingException
44
	 */
45
	public NodeList extract(List<String> aXpathExprList, Node aInput,
46
			String aRegExpression, Document aDocument, XPath aXpath) throws ProcessingException {
47
		
48
		log.debug("xpathExprList: " + aXpathExprList);
49
		log.debug("regExpr: " + aRegExpression);
50
		Set<String> identifierSet = new HashSet<String>();
51
		
52
//		log.debug("node: length: " + aInput.getChildNodes().getLength());
53
		log.debug("regular expression : " + aRegExpression);
54
		Pattern p = Pattern.compile(aRegExpression);
55
		try {
56
			List<String> textList = extractText(aXpathExprList, aInput, aXpath);
57
			for (String text: textList){
58
				log.debug("text as input : " + text);
59
				Matcher m = p.matcher(text);
60
				while (m.find()){
61
					log.debug("extracted identifier: " + m.group());
62
					identifierSet.add(m.group());
63
				}
64
			}
65
			return toNodeList(identifierSet, aDocument);
66
		} catch (XPathExpressionException e) {
67
			e.printStackTrace();
68
			throw new ProcessingException(e);
69
		} catch (ParserConfigurationException e) {
70
			e.printStackTrace();
71
			throw new ProcessingException(e);
72
		}
73
	}
74
	
75
	/**
76
	 * create a list of nodes from a list of string values
77
	 * @param aValueSet, set of unique values
78
	 * @param aDocument
79
	 * @return nodeList
80
	 */
81
	private NodeList toNodeList(Set<String> aValueSet, Document aDocument){
82
		DocumentFragment dFrag = aDocument.createDocumentFragment();
83
		Element root = aDocument.createElement("root");
84
		dFrag.appendChild(root);
85
		for (String value: aValueSet){
86
			Element eVal = aDocument.createElement("value");
87
			eVal.setTextContent(value);
88
			root.appendChild(eVal);
89
		}
90
		return dFrag.getChildNodes();
91
	}
92
	
93
	/**
94
	 * extract text from a given node using a list of given xpath expressions
95
	 * @param aXpathExprList
96
	 * @param aInput
97
	 * @param aXpath
98
	 * @return list of strings
99
	 * @throws XPathExpressionException
100
	 * @throws ParserConfigurationException 
101
	 */
102
	private List<String> extractText(List<String> aXpathExprList, Node aInput, XPath aXpath) throws XPathExpressionException, ParserConfigurationException{
103
		
104
		List<String> resultList = new LinkedList<String>();
105
		for (String xpathExpr: aXpathExprList){
106
			NodeList nodeList = (NodeList)aXpath.evaluate(xpathExpr, aInput, XPathConstants.NODESET);
107
			log.debug("extract text: nodelist length: " + nodeList.getLength()); 
108
			for (int i = 0; i < nodeList.getLength(); i++){
109
				resultList.add(nodeList.item(i).getTextContent());
110
			}
111
		}
112
		return resultList;
113
	}
114
}
(9-9/17)