1
|
package eu.dnetlib.data.mapreduce.hbase.propagation.orcidthroughproducts;
|
2
|
|
3
|
import com.googlecode.protobuf.format.JsonFormat;
|
4
|
import eu.dnetlib.data.mapreduce.hbase.propagation.*;
|
5
|
import eu.dnetlib.data.proto.*;
|
6
|
import org.apache.hadoop.io.Text;
|
7
|
|
8
|
import java.util.ArrayList;
|
9
|
import java.util.Arrays;
|
10
|
import java.util.Iterator;
|
11
|
import java.util.List;
|
12
|
import java.util.stream.Collectors;
|
13
|
|
14
|
public class ResultOrcidIterator extends ResultIterator {
|
15
|
|
16
|
private Iterator<String> author_iterator;
|
17
|
private List<FieldTypeProtos.Author> autoritative_authors ;
|
18
|
private List<String> relatedResult ;
|
19
|
|
20
|
|
21
|
/**
 * Creates the iterator by delegating to the {@code ResultIterator} constructor.
 *
 * @param values the serialized values associated with {@code key}
 * @param key    the key the values belong to
 * @throws NotValidResultSequenceException propagated from the superclass constructor
 */
public ResultOrcidIterator(final Iterable<Text> values, final String key)
        throws NotValidResultSequenceException {
    super(values, key);
}
|
24
|
|
25
|
@Override
|
26
|
protected void checkSequence() throws NotValidResultSequenceException {
|
27
|
if(!it.hasNext()){
|
28
|
throw new NotValidResultSequenceException("Empty information for key");
|
29
|
}
|
30
|
|
31
|
try {
|
32
|
autoritative_authors = new ArrayList<>();
|
33
|
relatedResult = new ArrayList<>();
|
34
|
analizeValueList();
|
35
|
|
36
|
}catch(JsonFormat.ParseException e){
|
37
|
throw new NotValidResultSequenceException("Problems recreating the author list from serialization");
|
38
|
}
|
39
|
|
40
|
List<FieldTypeProtos.Author> authors_with_orcid = autoritative_authors.stream()
|
41
|
.map(a -> {
|
42
|
if (a.getPidList() == null || a.getPidList().isEmpty())
|
43
|
return null;
|
44
|
return a;
|
45
|
})
|
46
|
.filter(a -> a!= null)
|
47
|
.filter(a -> containsOrcid(a.getPidList()))
|
48
|
.collect(Collectors.toList());
|
49
|
|
50
|
|
51
|
if(authors_with_orcid.size() == 0 || relatedResult.size() == 0){
|
52
|
resultId = TERMINATOR;
|
53
|
return;
|
54
|
}
|
55
|
|
56
|
|
57
|
author_iterator = relatedResult.iterator();
|
58
|
autoritative_authors = authors_with_orcid;
|
59
|
getNext();
|
60
|
|
61
|
}
|
62
|
|
63
|
private boolean containsOrcid(List<FieldTypeProtos.KeyValue> pidList){
|
64
|
if(pidList == null)
|
65
|
return false;
|
66
|
return pidList
|
67
|
.stream()
|
68
|
.filter(kv -> kv.getKey().equals(PropagationConstants.AUTHOR_PID))
|
69
|
.collect(Collectors.toList()).size() > 0;
|
70
|
}
|
71
|
|
72
|
private void getNext(){
|
73
|
if (author_iterator.hasNext())
|
74
|
resultId = author_iterator.next();
|
75
|
else
|
76
|
resultId = TERMINATOR;
|
77
|
}
|
78
|
|
79
|
@Override
|
80
|
public List<OafProtos.Oaf> next() {
|
81
|
//get the next merged author list
|
82
|
try {
|
83
|
//list of authors in the related result
|
84
|
Emit e = Emit.fromJson(resultId);
|
85
|
List<FieldTypeProtos.Author> author_list = getAuthorList(e);
|
86
|
|
87
|
ResultProtos.Result.Metadata.Builder metadata = searchMatch(author_list);
|
88
|
|
89
|
if (metadata != null){
|
90
|
ArrayList<OafProtos.Oaf> ret = new ArrayList<OafProtos.Oaf>(Arrays.asList(getUpdate(metadata, e.getId())));
|
91
|
getNext();
|
92
|
return ret;
|
93
|
}
|
94
|
|
95
|
|
96
|
}catch(JsonFormat.ParseException e){
|
97
|
|
98
|
}
|
99
|
getNext();
|
100
|
return null;
|
101
|
}
|
102
|
|
103
|
private ResultProtos.Result.Metadata.Builder searchMatch(List<FieldTypeProtos.Author> author_list){
|
104
|
ResultProtos.Result.Metadata.Builder metadataBuilder = ResultProtos.Result.Metadata.newBuilder();
|
105
|
boolean updated = false;
|
106
|
// for (FieldTypeProtos.Author a: autoritative_authors){
|
107
|
// searchAuthor(a,author_list);
|
108
|
// }
|
109
|
|
110
|
for (FieldTypeProtos.Author a: author_list){
|
111
|
FieldTypeProtos.Author.Builder author = searchAuthor(a, autoritative_authors);
|
112
|
if(author != null){
|
113
|
updated = true;
|
114
|
metadataBuilder.addAuthor(author);
|
115
|
}else{
|
116
|
metadataBuilder.addAuthor(FieldTypeProtos.Author.newBuilder(a));
|
117
|
}
|
118
|
}
|
119
|
if(updated)
|
120
|
return metadataBuilder;
|
121
|
return null;
|
122
|
}
|
123
|
|
124
|
|
125
|
private boolean equals(FieldTypeProtos.Author a1, FieldTypeProtos.Author a2){
|
126
|
if(a1.hasSurname()){
|
127
|
if(a2.hasSurname()){
|
128
|
if(!a1.getSurname().trim().equalsIgnoreCase(a2.getSurname().trim())){
|
129
|
return false;
|
130
|
}
|
131
|
//have the same surname. Check the name
|
132
|
if(a1.hasName()){
|
133
|
if (a2.hasName()){
|
134
|
if (a1.getName().trim().equalsIgnoreCase(a2.getName().trim())){
|
135
|
return true; //same name and same surname in a related research result
|
136
|
}
|
137
|
//they could be differently written (i.e. only the initials of the name in one of the two
|
138
|
return (a1.getName().trim().substring(0,0).equalsIgnoreCase(a2.getName().trim().substring(0,0)));
|
139
|
}
|
140
|
}
|
141
|
}
|
142
|
}
|
143
|
// if(a1.hasFullname()){
|
144
|
// if (a2.hasFullname()){
|
145
|
// if (a1.getFullname().trim().equalsIgnoreCase(a2.getFullname().trim())){
|
146
|
// return true;
|
147
|
// }
|
148
|
// //split string containing name and surname
|
149
|
// String[] ns_a1 = a1.getFullname().trim().split(" ");
|
150
|
// String[] ns_a2 = a2.getFullname().trim().split(" ");
|
151
|
//
|
152
|
//
|
153
|
// if (ns_a1[0].endsWith(".") || ns_a1[0].endsWith(",")){
|
154
|
// ns_a1[0] = ns_a1[0].substring(0,ns_a1[0].length()-1);
|
155
|
// }
|
156
|
// if (ns_a1[1].endsWith(".") || ns_a1[1].endsWith(",")){
|
157
|
// ns_a1[1] = ns_a1[1].substring(0,ns_a1[1].length()-1);
|
158
|
// }
|
159
|
//
|
160
|
// if (ns_a2[0].endsWith(".") || ns_a2[0].endsWith(",")){
|
161
|
// ns_a2[0] = ns_a2[0].substring(0,ns_a2[0].length()-1);
|
162
|
// }
|
163
|
// if (ns_a2[1].endsWith(".") || ns_a2[1].endsWith(",")){
|
164
|
// ns_a2[1] = ns_a2[1].substring(0,ns_a2[1].length()-1);
|
165
|
// }
|
166
|
//
|
167
|
// if(ns_a1[0].compareTo(ns_a1[1]) < 0){
|
168
|
// String tmp = ns_a1[0];
|
169
|
// ns_a1[0] = ns_a1[1];
|
170
|
// ns_a1[1] = tmp;
|
171
|
// }
|
172
|
//
|
173
|
// if(ns_a2[0].compareTo(ns_a2[1]) < 0){
|
174
|
// String tmp = ns_a2[0];
|
175
|
// ns_a2[0] = ns_a2[1];
|
176
|
// ns_a2[1] = tmp;
|
177
|
//
|
178
|
// }
|
179
|
//
|
180
|
// if(ns_a1[0].equalsIgnoreCase(ns_a2[0])){
|
181
|
// if(ns_a1[1].equalsIgnoreCase(ns_a2[1])){//same name and surname
|
182
|
// return true;
|
183
|
// }
|
184
|
// if(ns_a1[1].length() == 1 || ns_a2[1].length() == 1){
|
185
|
// return ns_a1[1].charAt(0) == ns_a2[1].charAt(0);//same surname and initial of the name
|
186
|
// }
|
187
|
// return false;
|
188
|
//
|
189
|
// }else{
|
190
|
// if(ns_a1[1].equalsIgnoreCase(ns_a2[1])){
|
191
|
// if(ns_a1[0].length() == 1 || ns_a2[0].length()==1)
|
192
|
// return ns_a1[0].charAt(0) == ns_a2[0].charAt(0);
|
193
|
// else
|
194
|
// return false;
|
195
|
// }
|
196
|
// }
|
197
|
//
|
198
|
//
|
199
|
//
|
200
|
// }
|
201
|
// return false;
|
202
|
// }
|
203
|
return false;
|
204
|
|
205
|
}
|
206
|
|
207
|
private FieldTypeProtos.Author.Builder searchAuthor(FieldTypeProtos.Author a, List<FieldTypeProtos.Author> author_list){
|
208
|
if(containsOrcid(a.getPidList()))
|
209
|
return null;
|
210
|
for(FieldTypeProtos.Author autoritative_author : author_list) {
|
211
|
if (equals(autoritative_author, a)) {
|
212
|
if(!containsOrcid(a.getPidList()))
|
213
|
return update(a, autoritative_author);
|
214
|
}
|
215
|
}
|
216
|
return null;
|
217
|
|
218
|
}
|
219
|
|
220
|
private void analizeValueList() throws JsonFormat.ParseException {
|
221
|
while(it.hasNext()){
|
222
|
Value v = Value.fromJson(it.next().toString());
|
223
|
|
224
|
if(v.getType().equals(PropagationConstants.Type.fromresult)){
|
225
|
autoritative_authors.addAll(getAuthorList(Emit.fromJson(v.getValue ())));
|
226
|
}
|
227
|
if(v.getType().equals(PropagationConstants.Type.fromsemrel)){
|
228
|
relatedResult.add(v.getValue());
|
229
|
}
|
230
|
}
|
231
|
|
232
|
}
|
233
|
private FieldTypeProtos.Author.Builder update(FieldTypeProtos.Author related_author, FieldTypeProtos.Author autoritative_autor ){
|
234
|
|
235
|
FieldTypeProtos.Author.Builder res = FieldTypeProtos.Author.newBuilder(related_author);
|
236
|
List<FieldTypeProtos.KeyValue> apid_list = autoritative_autor.getPidList();
|
237
|
FieldTypeProtos.KeyValue akv = apid_list.stream().filter(kv -> kv.getKey().equals(PropagationConstants.AUTHOR_PID)).collect(Collectors.toList()).get(0);
|
238
|
FieldTypeProtos.KeyValue.Builder kvb = FieldTypeProtos.KeyValue.newBuilder();
|
239
|
kvb.setKey(akv.getKey()).setValue(akv.getValue());
|
240
|
kvb.setDataInfo(Utils.getDataInfo(
|
241
|
PropagationConstants.ORCID_RESULT_TRUST,
|
242
|
PropagationConstants.CLASS_ORCID_ID,
|
243
|
PropagationConstants.SCHEMA_ID,
|
244
|
PropagationConstants.SCHEMA_NAME,
|
245
|
PropagationConstants.DATA_INFO_TYPE,
|
246
|
PropagationConstants.CLASS_ORCID_NAME)
|
247
|
);
|
248
|
return res.addPid(kvb);
|
249
|
|
250
|
|
251
|
}
|
252
|
|
253
|
private List<FieldTypeProtos.Author> getAuthorList(Emit e) throws JsonFormat.ParseException {
|
254
|
|
255
|
List<FieldTypeProtos.Author> authors = new ArrayList<>();
|
256
|
for (String author : e.getAuthor_list()) {
|
257
|
FieldTypeProtos.Author.Builder author_builder = FieldTypeProtos.Author.newBuilder();
|
258
|
JsonFormat.merge(author, author_builder);
|
259
|
authors.add(author_builder.build());
|
260
|
}
|
261
|
|
262
|
return authors;
|
263
|
|
264
|
}
|
265
|
|
266
|
|
267
|
public static OafProtos.Oaf getUpdate(ResultProtos.Result.Metadata.Builder metadata, String resultId) {
|
268
|
final ResultProtos.Result.Builder result = ResultProtos.Result.newBuilder().setMetadata(metadata);
|
269
|
final OafProtos.OafEntity.Builder entity = OafProtos.OafEntity.newBuilder()
|
270
|
.setType(TypeProtos.Type.result)
|
271
|
.setId(resultId)
|
272
|
.setResult(result);
|
273
|
|
274
|
return OafProtos.Oaf.newBuilder()
|
275
|
.setKind(KindProtos.Kind.entity)
|
276
|
.setEntity(entity)
|
277
|
.build();
|
278
|
}
|
279
|
}
|