Project

General

Profile

1
package eu.dnetlib.pace.model;
2

    
3
import java.nio.charset.Charset;
import java.text.Normalizer;
import java.util.List;
import java.util.Locale;
import java.util.Set;

import com.google.common.base.Joiner;
import com.google.common.base.Splitter;
import com.google.common.collect.Iterables;
import com.google.common.collect.Lists;
import com.google.common.hash.Hashing;

import eu.dnetlib.pace.common.AbstractPaceFunctions;
import eu.dnetlib.pace.util.Capitalise;
import eu.dnetlib.pace.util.DotAbbreviations;
17

    
18
public class Person {
19

    
20
	private static final String UTF8 = "UTF-8";
21
	private List<String> name = Lists.newArrayList();
22
	private List<String> surname = Lists.newArrayList();
23
	private List<String> fullname = Lists.newArrayList();
24
	private final String original;
25

    
26
	private static Set<String> particles = null;
27

    
28
	public Person(String s, final boolean aggressive) {
29
		original = s;
30
		s = Normalizer.normalize(s, Normalizer.Form.NFD);
31
		s = s.replaceAll("\\(.+\\)", "");
32
		s = s.replaceAll("\\[.+\\]", "");
33
		s = s.replaceAll("\\{.+\\}", "");
34
		s = s.replaceAll("\\s+-\\s+", "-");
35
		s = s.replaceAll("[\\p{Punct}&&[^,-]]", " ");
36
		s = s.replaceAll("\\d", " ");
37
		s = s.replaceAll("\\n", " ");
38
		s = s.replaceAll("\\.", " ");
39
		s = s.replaceAll("\\s+", " ");
40

    
41
		if (aggressive) {
42
			s = s.replaceAll("[\\p{InCombiningDiacriticalMarks}&&[^,-]]", "");
43
			// s = s.replaceAll("[\\W&&[^,-]]", "");
44
		}
45

    
46
		if (s.contains(",")) {
47
			final String[] arr = s.split(",");
48
			if (arr.length == 1) {
49
				fullname = splitTerms(arr[0]);
50
			} else if (arr.length > 1) {
51
				surname = splitTerms(arr[0]);
52
				name = splitTerms(arr[1]);
53
				fullname.addAll(surname);
54
				fullname.addAll(name);
55
			}
56
		} else {
57
			fullname = splitTerms(s);
58

    
59
			int lastInitialPosition = fullname.size();
60
			boolean hasSurnameInUpperCase = false;
61

    
62
			for (int i = 0; i < fullname.size(); i++) {
63
				final String term = fullname.get(i);
64
				if (term.length() == 1) {
65
					lastInitialPosition = i;
66
				} else if (term.equals(term.toUpperCase())) {
67
					hasSurnameInUpperCase = true;
68
				}
69
			}
70

    
71
			if (lastInitialPosition < (fullname.size() - 1)) { // Case: Michele G. Artini
72
				name = fullname.subList(0, lastInitialPosition + 1);
73
				surname = fullname.subList(lastInitialPosition + 1, fullname.size());
74
			} else if (hasSurnameInUpperCase) { // Case: Michele ARTINI
75
				for (final String term : fullname) {
76
					if ((term.length() > 1) && term.equals(term.toUpperCase())) {
77
						surname.add(term);
78
					} else {
79
						name.add(term);
80
					}
81
				}
82
			}
83
		}
84
	}
85

    
86
	private List<String> splitTerms(final String s) {
87
		if (particles == null) {
88
			particles = AbstractPaceFunctions.loadFromClasspath("/eu/dnetlib/pace/config/name_particles.txt");
89
		}
90

    
91
		final List<String> list = Lists.newArrayList();
92
		for (final String part : Splitter.on(" ").omitEmptyStrings().split(s)) {
93
			if (!particles.contains(part.toLowerCase())) {
94
				list.add(part);
95
			}
96
		}
97
		return list;
98
	}
99

    
100
	public List<String> getName() {
101
		return name;
102
	}
103

    
104
	public String getNameString() {
105
		return Joiner.on(" ").join(getName());
106
	}
107

    
108
	public List<String> getSurname() {
109
		return surname;
110
	}
111

    
112
	public List<String> getFullname() {
113
		return fullname;
114
	}
115

    
116
	public String getOriginal() {
117
		return original;
118
	}
119

    
120
	public String hash() {
121
		return Hashing.murmur3_128().hashString(getNormalisedFullname(), Charset.forName(UTF8)).toString();
122
	}
123

    
124
	public String getNormalisedFirstName() {
125
		return Joiner.on(" ").join(getCapitalFirstnames());
126
	}
127

    
128
	public String getNormalisedSurname() {
129
		return Joiner.on(" ").join(getCapitalSurname());
130
	}
131

    
132
	public String getSurnameString() {
133
		return Joiner.on(" ").join(getSurname());
134
	}
135

    
136
	public String getNormalisedFullname() {
137
		return isAccurate() ? getNormalisedSurname() + ", " + getNormalisedFirstName() : Joiner.on(" ").join(fullname);
138
	}
139

    
140
	public List<String> getCapitalFirstnames() {
141
		return Lists.newArrayList(Iterables.transform(getNameWithAbbreviations(), new Capitalise()));
142
	}
143

    
144
	public List<String> getCapitalSurname() {
145
		return Lists.newArrayList(Iterables.transform(surname, new Capitalise()));
146
	}
147

    
148
	public List<String> getNameWithAbbreviations() {
149
		return Lists.newArrayList(Iterables.transform(name, new DotAbbreviations()));
150
	}
151

    
152
	public boolean isAccurate() {
153
		return ((name != null) && (surname != null) && !name.isEmpty() && !surname.isEmpty());
154
	}
155
}
(14-14/15)