Project

General

Profile

1
package prototype;
2

    
3
import java.nio.charset.Charset;
4
import java.nio.charset.StandardCharsets;
5
import java.text.Normalizer;
6
import java.util.List;
7
import java.util.Set;
8

    
9
import prototype.utils.Capitalize;
10
import prototype.utils.DotAbbreviations;
11

    
12
import com.google.common.base.Joiner;
13
import com.google.common.base.Splitter;
14
import com.google.common.collect.Iterables;
15
import com.google.common.collect.Lists;
16
import com.google.common.hash.Hashing;
17

    
18
//import eu.dnetlib.pace.clustering.NGramUtils;
19
//import eu.dnetlib.pace.util.Capitalise;
20
//import eu.dnetlib.pace.util.DotAbbreviations;
21

    
22
public class Person {
23
	private List<String> name = Lists.newArrayList();
24
	private List<String> surname = Lists.newArrayList();
25
	private List<String> fullname = Lists.newArrayList();
26

    
27
	private static Set<String> particles = null;
28

    
29
	public Person(String s) {
30
		s = Normalizer.normalize(s, Normalizer.Form.NFD); // was NFD
31
		s = s.replaceAll("\\(.+\\)", "");
32
		s = s.replaceAll("\\[.+\\]", "");
33
		s = s.replaceAll("\\{.+\\}", "");
34
		s = s.replaceAll("\\s+-\\s+", "-");
35
		
36

    
37
//		s = s.replaceAll("[\\W&&[^,-]]", " ");
38
		
39
//		System.out.println("class Person: s: " + s);
40

    
41
//		s = s.replaceAll("[\\p{InCombiningDiacriticalMarks}&&[^,-]]", " ");
42
		s = s.replaceAll("[\\p{Punct}&&[^-,]]", " ");
43
		s = s.replaceAll("\\d", " ");
44
		s = s.replaceAll("\\n", " ");
45
		s = s.replaceAll("\\.", " ");
46
		s = s.replaceAll("\\s+", " ");
47

    
48
		if (s.contains(",")) {
49
	//		System.out.println("class Person: s: " + s);
50

    
51
			String[] arr = s.split(",");
52
			if (arr.length == 1) {
53
				fullname = splitTerms(arr[0]);
54
			} else if (arr.length > 1) {
55
				surname = splitTerms(arr[0]);
56
				name = splitTermsFirstName(arr[1]);
57
//				System.out.println("class Person: surname: " + surname);
58
//				System.out.println("class Person: name: " + name);
59

    
60
				fullname.addAll(surname);
61
				fullname.addAll(name);
62
			}
63
		} else {
64
			fullname = splitTerms(s);
65

    
66
			int lastInitialPosition = fullname.size();
67
			boolean hasSurnameInUpperCase = false;
68

    
69
			for (int i = 0; i < fullname.size(); i++) {
70
				String term = fullname.get(i);
71
				if (term.length() == 1) {
72
					lastInitialPosition = i;
73
				} else if (term.equals(term.toUpperCase())) {
74
					hasSurnameInUpperCase = true;
75
				}
76
			}
77
			if (lastInitialPosition < fullname.size() - 1) { // Case: Michele G. Artini
78
				name = fullname.subList(0, lastInitialPosition + 1);
79
				System.out.println("name: " + name);
80
				surname = fullname.subList(lastInitialPosition + 1, fullname.size());
81
			} else if (hasSurnameInUpperCase) { // Case: Michele ARTINI
82
				for (String term : fullname) {
83
					if (term.length() > 1 && term.equals(term.toUpperCase())) {
84
						surname.add(term);
85
					} else {
86
						name.add(term);
87
					}
88
				}
89
			} else if (lastInitialPosition == fullname.size()){
90
				surname = fullname.subList(lastInitialPosition - 1, fullname.size());
91
				name = fullname.subList(0,  lastInitialPosition - 1);
92
			}
93
			
94
		}
95
	}
96
	
97
	private List<String> splitTermsFirstName(String s){
98
		List<String> list = Lists.newArrayList();
99
		for (String part : Splitter.on(" ").omitEmptyStrings().split(s)) {
100
			if (s.trim().matches("\\p{Lu}{2,3}")){
101
				String[] parts = s.trim().split("(?=\\p{Lu})"); // (Unicode UpperCase)
102
				for (String p: parts){
103
					if (p.length() > 0)
104
						list.add(p);					
105
				}				
106
			}else{
107
				list.add(part);				
108
			}
109

    
110
		}
111
		return list;
112
	}
113

    
114
	private List<String> splitTerms(String s) {
115
		if (particles == null) {
116
//			particles = NGramUtils.loadFromClasspath("/eu/dnetlib/pace/config/name_particles.txt");
117
		}
118

    
119
		List<String> list = Lists.newArrayList();
120
		for (String part : Splitter.on(" ").omitEmptyStrings().split(s)) {
121
	//		if (!particles.contains(part.toLowerCase())) {
122
			list.add(part);				
123

    
124
	//		}
125
		}
126
		return list;
127
	}
128

    
129
	public List<String> getName() {
130
		return name;
131
	}
132

    
133
	public List<String> getSurname() {
134
		return surname;
135
	}
136

    
137
	public List<String> getFullname() {
138
		return fullname;
139
	}
140
	
141
	public String hash() {
142
		return Hashing.murmur3_128().hashString(getNormalisedFullname(),StandardCharsets.UTF_8).toString();
143
	}
144
	
145
	public String getNormalisedFullname() {
146
		return isAccurate() ? 
147
				Joiner.on(" ").join(getSurname()) + ", " + Joiner.on(" ").join(getNameWithAbbreviations()) : 
148
				Joiner.on(" ").join(fullname);
149
//				return isAccurate() ? 
150
//						Joiner.on(" ").join(getCapitalSurname()) + ", " + Joiner.on(" ").join(getNameWithAbbreviations()) : 
151
//						Joiner.on(" ").join(fullname);
152
	}
153
	
154
	public List<String> getCapitalSurname() {
155
		return Lists.newArrayList(Iterables.transform(surname, new Capitalize() ));
156
	}
157
	
158
	public List<String> getNameWithAbbreviations() {
159
		return Lists.newArrayList(Iterables.transform(name, new DotAbbreviations() ));
160
	}	
161

    
162
	public boolean isAccurate() {
163
		return (name != null && surname != null && !name.isEmpty() && !surname.isEmpty());
164
	}
165
}
(1-1/2)