Project

General

Profile

1
package eu.dnetlib.data.mapreduce.hbase.dedup.experiment;
2

    
3
import java.util.List;
import java.util.Locale;
import java.util.regex.Pattern;

import com.google.common.base.Splitter;
import org.apache.commons.lang.StringUtils;
import org.dom4j.Element;
8

    
9
/**
10
 * Created by claudio on 25/03/16.
11
 */
12
public class SubjectParser {
13

    
14
	public static final String REGEX_SUBJECT = "^(info:eu-repo)\\/(classification)\\/([a-zA-Z]*)\\/(.*)$";
15
	private static final int MIN_LENGTH = 5;
16

    
17
	public SubjectsMap parse(final org.dom4j.Document doc) {
18

    
19
		final List subjectNodes = doc.selectNodes("//*[local-name() = 'subject']");
20
		final SubjectsMap subjectMap = new SubjectsMap();
21

    
22
		for(int i = 0; i<subjectNodes.size(); i++) {
23
			final Element e = (Element) subjectNodes.get(i);
24
			final String subject = e.getText();
25

    
26
			final String type = guessType(subject);
27
			if (!subjectMap.containsKey(type)) {
28
				subjectMap.put(type, new Subjects());
29
			}
30

    
31
			if (StringUtils.isNotBlank(type)) {
32
				if ("keyword".equals(type)) {
33
					final Splitter splitter = Splitter.on(",").trimResults().omitEmptyStrings();
34
					for (String token : splitter.split(subject)) {
35
						final String value = token.replaceAll("[^a-zA-Z ]", "").toLowerCase();
36
						if (value.length() >= MIN_LENGTH) {
37
							subjectMap.get(type).add(value);
38
						}
39
					}
40
				} else {
41
					String token = subject.replaceFirst(REGEX_SUBJECT, "$4");
42

    
43
					if (StringUtils.isNotBlank(token)) {
44
						final String value = token.replaceAll("[^a-zA-Z ]", "").toLowerCase();
45
						if (value.length() >= MIN_LENGTH) {
46
							subjectMap.get(type).add(value);
47
						}
48
					}
49
				}
50
			}
51
		}
52

    
53
		return subjectMap;
54
	}
55

    
56
	private String guessType(final String subject) {
57
		if (subject.startsWith("info:eu-repo")) {
58
			final String s = subject.replaceAll(REGEX_SUBJECT, "$3");
59
			return s;
60
		} else {
61
			return "keyword";
62
		}
63
	}
64
}
(7-7/9)