Revision 55238
Added by Alessia Bardi over 4 years ago
modules/dnet-mapreduce-jobs/trunk/src/test/java/eu/dnetlib/data/mapreduce/actions/DOIBoostToActionsTest.java | ||
---|---|---|
1 | 1 |
package eu.dnetlib.data.mapreduce.actions; |
2 | 2 |
|
3 |
import java.io.*; |
|
4 |
import java.util.List; |
|
5 |
import java.util.zip.DataFormatException; |
|
6 |
import java.util.zip.Inflater; |
|
7 |
|
|
3 | 8 |
import com.google.common.collect.Lists; |
4 | 9 |
import com.google.gson.JsonObject; |
5 | 10 |
import com.google.gson.JsonParser; |
6 | 11 |
import eu.dnetlib.actionmanager.actions.ActionFactory; |
7 | 12 |
import eu.dnetlib.actionmanager.actions.AtomicAction; |
8 | 13 |
import eu.dnetlib.actionmanager.common.Agent; |
14 |
import eu.dnetlib.data.mapreduce.hbase.Reporter; |
|
9 | 15 |
import eu.dnetlib.data.mapreduce.hbase.dataimport.DOIBoostToActions; |
10 | 16 |
import eu.dnetlib.data.transform.Column; |
11 | 17 |
import eu.dnetlib.data.transform.Row; |
... | ... | |
14 | 20 |
import org.junit.Before; |
15 | 21 |
import org.junit.Test; |
16 | 22 |
|
17 |
import java.io.*; |
|
18 |
import java.util.List; |
|
19 |
import java.util.zip.DataFormatException; |
|
20 |
import java.util.zip.Inflater; |
|
21 |
|
|
22 | 23 |
public class DOIBoostToActionsTest { |
23 | 24 |
private String setName; |
24 | 25 |
private Agent agent; |
26 |
private Reporter reporter; |
|
25 | 27 |
|
26 | 28 |
|
27 |
|
|
28 | 29 |
@Before |
29 | 30 |
public void setup() { |
30 | 31 |
setName = "DLI"; |
31 | 32 |
agent= new Agent("agentId","agentName", Agent.AGENT_TYPE.service); |
33 |
reporter = (Reporter) (counterGroup, counterName, delta) -> System.out.println(String.format("COUNTER: %s - %s : %d", counterGroup, counterName, delta)); |
|
32 | 34 |
} |
33 | 35 |
|
34 | 36 |
@Test |
35 | 37 |
public void testSingleDOIBoostAction() throws IOException { |
36 | 38 |
doTestSingleDOIBoostAction("/eu/dnetlib/data/mapreduce/actions/DOIBoostAction.json"); |
37 | 39 |
} |
40 |
@Test |
|
41 |
public void testSingleDOIBoostActionFilter() throws IOException { |
|
42 |
doTestSingleDOIBoostAction("/eu/dnetlib/data/mapreduce/actions/DOIBoostAction_filterOut.json"); |
|
43 |
} |
|
38 | 44 |
|
39 | 45 |
|
40 | 46 |
@Test |
... | ... | |
94 | 100 |
|
95 | 101 |
final JsonParser parser = new JsonParser(); |
96 | 102 |
JsonObject root = parser.parse(line).getAsJsonObject(); |
97 |
List<AtomicAction> actions = DOIBoostToActions.generatePublicationActionsFromDump(root, new ActionFactory(), setName, agent, false, false); |
|
103 |
List<AtomicAction> actions = DOIBoostToActions.generatePublicationActionsFromDump(root, new ActionFactory(), setName, agent, false, false,reporter); |
|
104 |
|
|
98 | 105 |
if (actions!= null) { |
99 | 106 |
actions.forEach(action-> { |
100 | 107 |
if (action.getTargetColumn().equals("body") && action.getTargetColumnFamily().equals("result")) |
... | ... | |
121 | 128 |
|
122 | 129 |
final JsonParser parser = new JsonParser(); |
123 | 130 |
JsonObject root = parser.parse(line).getAsJsonObject(); |
124 |
List<AtomicAction> actions = DOIBoostToActions.generatePublicationActionsFromDump(root, new ActionFactory(), setName, agent, false, false); |
|
131 |
List<AtomicAction> actions = DOIBoostToActions.generatePublicationActionsFromDump(root, new ActionFactory(), setName, agent, false, false, reporter);
|
|
125 | 132 |
if (actions!= null) { |
126 | 133 |
|
127 | 134 |
actions.forEach(it -> System.out.println(String.format(" RowKey:%s TargetColumnFamily:%s TargetColumn: %s", it.getTargetRowKey(), it.getTargetColumnFamily(), it.getTargetColumn()))); |
... | ... | |
141 | 148 |
final JsonParser parser = new JsonParser(); |
142 | 149 |
JsonObject root = parser.parse(line).getAsJsonObject(); |
143 | 150 |
try { |
144 |
List<AtomicAction> atomicActions = DOIBoostToActions.generatePublicationActionsFromDump(root, new ActionFactory(), setName, agent, false, false); |
|
151 |
List<AtomicAction> atomicActions = DOIBoostToActions.generatePublicationActionsFromDump(root, new ActionFactory(), setName, agent, false, false, reporter);
|
|
145 | 152 |
if (atomicActions!= null) |
146 | 153 |
{ |
147 | 154 |
i ++; |
modules/dnet-mapreduce-jobs/trunk/src/test/resources/eu/dnetlib/data/mapreduce/actions/DOIBoostAction_filterOut.json | ||
---|---|---|
1 |
{"publisher": "Elsevier BV", "doi": "10.1002/(sici)1098-0997(1999)7:5<248::aid-idog8>3.0.co;2-v", "hostedByOpenAire": {"name": "Infectious Diseases in Obstetrics and Gynecology", "id": "doajarticles::1064-7449"}, "license": [{"url": "http://doi.wiley.com/10.1002/tdm_license_1.1", "content-version": "tdm", "\"delay-in-days": null, "date-time": "2015-09-01T00:00:00Z"}, {"url": "https://doi.org/10.1002/(sici)1098-0997(1999)7:5<248::aid-idog8>3.0.co;2-v", "provenance": "UnpayWall", "access-rights": "OPEN"}], "title": ["Pelvic inflammatory disease in the postmenopausal woman"], "issued": "1999-1-1", "abstract": [{"provenance": "MAG", "value": "eJx9kk1vFDEMhv+KjyBtR2JFC7vXtqBKLUVcOHsynhmjTBzFzqzm3+PsAuUAvSRx5I/Hr/3c/6Bg\nvNIRvtHKdAJckSP2kSCyUUGrhUASZIorB+A0RlwWNCkbDKyESv4JWdQWSpKxKkY4iRsd3JHylI7w\ndH/3+PDl/u+U5VJuLLLAu8PNDZj4fTh0DqI1mh7h62slFWoKsiyO9t/yD9b8ssRt4VCkZ4w7kNEo\ntf8gKdRSKBmc2Gaw2ousWBgTYK+BVGGU4oVZ0g4wDS3qEo6qEhiNhkus2EzF4XBKoqQd3HryWNUj\nvY9/wF3CXhMVXaSe1MAKvRTilJ2nMWds6C5ndDTjPx3qmRRzLpJLQwRecOI0gVoduMF98hG3Gbjk\nhTSL+/uzZelZjIOCzlLjABuTnwhRTmCz+87itosCWstEZbvI4koqD22y3q8rBBgdLGFbrBdRfift\nCc7ghpxo8Cml0XfQl4Ud7blXIzc+b4mCxA4+HPfvP17tr/e7X+vx5vbt+fWdI21Xj6y68xSh+wkk\nbARZ\n"}], "issn": [{"type": "print", "value": "1064-7449"}, {"type": "electronic", "value": "1098-0997"}], "doi-url": "http://dx.doi.org/10.1002/(sici)1098-0997(1999)7:5<248::aid-idog8>3.0.co;2-v", "instances": [{"url": "https://api.wiley.com/onlinelibrary/tdm/v1/articles/10.1002%2F(SICI)1098-0997(1999)7:5%3C248::AID-IDOG8%3E3.0.CO;2-V", "provenance": "CrossRef", "access-rights": "UNKNOWN"}, {"url": "https://onlinelibrary.wiley.com/doi/full/10.1002/(SICI)1098-0997(1999)7:5%3C248::AID-IDOG8%3E3.0.CO;2-V", "provenance": "CrossRef", "access-rights": "UNKNOWN"}], "authors": [{"affiliations": [{"official-page": "http://www.musc.edu/", "provenance": "MAG", "value": "Medical University of South Carolina", "identifiers": [{"value": "http://en.wikipedia.org/wiki/Medical_University_of_South_Carolina", 
"schema": "wikpedia"}, {"value": "grid.259828.c", "schema": "grid.ac"}, {"value": "https://academic.microsoft.com/#/detail/153297377", "schema": "URL"}]}], "given": "Addie", "identifiers": [{"provenance": "MAG", "value": "https://academic.microsoft.com/#/detail/2517302922", "schema": "URL"}], "fullname": "", "family": "Jackson"}, {"affiliations": [{"official-page": "http://www.musc.edu/", "provenance": "MAG", "value": "Medical University of South Carolina", "identifiers": [{"value": "http://en.wikipedia.org/wiki/Medical_University_of_South_Carolina", "schema": "wikpedia"}, {"value": "grid.259828.c", "schema": "grid.ac"}, {"value": "https://academic.microsoft.com/#/detail/153297377", "schema": "URL"}]}], "given": "D.E.", "identifiers": [{"provenance": "MAG", "value": "https://academic.microsoft.com/#/detail/2163996951", "schema": "URL"}], "fullname": "D.E. Soper", "family": "Soper"}], "published-print": "1999-1-1", "collectedFrom": ["CrossRef", "MAG", "UnpayWall"], "accepted": null, "type": "journal-article", "published-online": null, "subject": ["Obstetrics and Gynaecology", "Infectious Diseases", "Dermatology"]} |
modules/dnet-mapreduce-jobs/trunk/src/main/java/eu/dnetlib/data/mapreduce/hbase/Reporter.java | ||
---|---|---|
1 |
package eu.dnetlib.data.mapreduce.hbase; |
|
2 |
|
|
3 |
import java.io.Serializable; |
|
4 |
|
|
5 |
/**
 * Callback through which code reports increments of named counters without
 * depending on a concrete counter implementation.
 *
 * <p>The interface has exactly one abstract method, so it can be implemented
 * with a lambda expression; {@link FunctionalInterface} makes the compiler
 * enforce that this stays true.
 *
 * Created by Alessia Bardi on 2019-04-08.
 *
 * @author Alessia Bardi
 */
@FunctionalInterface
public interface Reporter extends Serializable {

    /**
     * Reports that a counter should be increased.
     *
     * @param counterGroup name of the group the counter belongs to
     * @param counterName  name of the counter within the group
     * @param delta        amount to add to the counter
     */
    void incrementCounter(String counterGroup, String counterName, long delta);

}
modules/dnet-mapreduce-jobs/trunk/src/main/java/eu/dnetlib/data/mapreduce/hbase/dataimport/DOIBoostToActions.java | ||
---|---|---|
1 | 1 |
package eu.dnetlib.data.mapreduce.hbase.dataimport; |
2 | 2 |
|
3 |
import java.io.ByteArrayOutputStream; |
|
4 |
import java.io.IOException; |
|
5 |
import java.io.InputStream; |
|
6 |
import java.util.*; |
|
7 |
import java.util.concurrent.atomic.AtomicInteger; |
|
8 |
import java.util.function.Function; |
|
9 |
import java.util.stream.Collectors; |
|
10 |
import java.util.zip.Inflater; |
|
11 |
|
|
3 | 12 |
import com.google.gson.Gson; |
4 | 13 |
import com.google.gson.JsonElement; |
5 | 14 |
import com.google.gson.JsonObject; |
6 |
import com.googlecode.protobuf.format.JsonFormat; |
|
7 | 15 |
import eu.dnetlib.actionmanager.actions.ActionFactory; |
8 | 16 |
import eu.dnetlib.actionmanager.actions.AtomicAction; |
9 | 17 |
import eu.dnetlib.actionmanager.common.Agent; |
18 |
import eu.dnetlib.data.mapreduce.hbase.Reporter; |
|
10 | 19 |
import eu.dnetlib.data.mapreduce.util.StreamUtils; |
11 | 20 |
import eu.dnetlib.data.proto.*; |
12 | 21 |
import eu.dnetlib.data.transform.xml.AbstractDNetXsltFunctions; |
... | ... | |
15 | 24 |
import org.apache.commons.io.IOUtils; |
16 | 25 |
import org.apache.commons.lang3.StringUtils; |
17 | 26 |
|
18 |
import java.io.ByteArrayOutputStream; |
|
19 |
import java.io.IOException; |
|
20 |
import java.io.InputStream; |
|
21 |
import java.util.*; |
|
22 |
import java.util.concurrent.atomic.AtomicInteger; |
|
23 |
import java.util.function.Function; |
|
24 |
import java.util.stream.Collectors; |
|
25 |
import java.util.zip.Inflater; |
|
26 |
|
|
27 | 27 |
import static eu.dnetlib.data.mapreduce.hbase.dataimport.DumpToActionsUtility.*; |
28 | 28 |
import static eu.dnetlib.data.proto.ResultOrganizationProtos.ResultOrganization; |
29 | 29 |
|
... | ... | |
89 | 89 |
} |
90 | 90 |
} |
91 | 91 |
|
92 |
protected static boolean isValid(final JsonObject rootElement, final Reporter context){ |
|
92 | 93 |
|
94 |
final String doi = getStringValue(rootElement, "doi"); |
|
95 |
if (doi == null) { |
|
96 |
context.incrementCounter("filtered","no_doi", 1); |
|
97 |
return false; |
|
98 |
} |
|
99 |
final String type = getStringValue(rootElement,"type"); |
|
100 |
if (!typologiesMapping.containsKey(type)){ |
|
101 |
context.incrementCounter("filtered","unknowntype_"+type, 1); |
|
102 |
return false; |
|
103 |
} |
|
104 |
// fixes #4360 (test publisher) |
|
105 |
final String publisher = getStringValue(rootElement, "publisher"); |
|
106 |
if(publisher.equalsIgnoreCase("Test accounts")){ |
|
107 |
context.incrementCounter("filtered","test_publisher", 1); |
|
108 |
return false; |
|
109 |
} |
|
93 | 110 |
|
111 |
List<JsonObject> authors = getArrayObjects(rootElement, "authors"); |
|
112 |
boolean hasAuthor = false; |
|
113 |
for(JsonObject author : authors){ |
|
114 |
final String given = getStringValue(author, "given"); |
|
115 |
final String family = getStringValue(author, "family"); |
|
116 |
final String fullname = getStringValue(author, "fullname"); |
|
117 |
if (StringUtils.isNotBlank(fullname) || (StringUtils.isNotBlank(given) && StringUtils.isNotBlank(family))) { |
|
118 |
hasAuthor = true; |
|
119 |
} |
|
120 |
// fixes #4368 |
|
121 |
if(StringUtils.isNotBlank(given) && given.equalsIgnoreCase("Addie") && StringUtils.isNotBlank(family) && family.equalsIgnoreCase("Jackson") && publisher.equalsIgnoreCase("Elsevier BV")){ |
|
122 |
context.incrementCounter("filtered","addieJackson", 1); |
|
123 |
return false; |
|
124 |
} |
|
125 |
} |
|
126 |
if(!hasAuthor){ |
|
127 |
context.incrementCounter("filtered","no_authors", 1); |
|
128 |
return false; |
|
129 |
} |
|
130 |
// fixes #4360 |
|
131 |
if(getCleanedTitles(rootElement).isEmpty()){ |
|
132 |
context.incrementCounter("filtered","invalid_title", 1); |
|
133 |
return false; |
|
134 |
} |
|
94 | 135 |
|
136 |
return true; |
|
137 |
} |
|
138 |
|
|
139 |
private static List<String> getCleanedTitles(final JsonObject rootElement){ |
|
140 |
List<String> titles = getArrayValues(rootElement, "title"); |
|
141 |
return titles.stream().filter( t -> StringUtils.isNotBlank(t) && !t.equalsIgnoreCase("[NO TITLE AVAILABLE]")).collect(Collectors.toList()); |
|
142 |
} |
|
143 |
|
|
144 |
|
|
95 | 145 |
public static List<AtomicAction> generatePublicationActionsFromDump(final JsonObject rootElement, final ActionFactory factory, final String setName, final Agent agent, boolean invisible, |
96 |
final boolean onlyOrganization) { |
|
146 |
final boolean onlyOrganization, final Reporter context) {
|
|
97 | 147 |
|
148 |
if(!isValid(rootElement, context)) return null; |
|
149 |
|
|
98 | 150 |
//Create OAF Proto |
99 | 151 |
|
100 | 152 |
final OafProtos.Oaf.Builder oaf = OafProtos.Oaf.newBuilder(); |
... | ... | |
135 | 187 |
} |
136 | 188 |
//Adding identifier |
137 | 189 |
final String doi = getStringValue(rootElement, "doi"); |
138 |
if (doi == null) |
|
139 |
return null; |
|
140 | 190 |
entity.addOriginalId(doi); |
191 |
|
|
141 | 192 |
final String sourceId = String.format("50|%s" + SEPARATOR + "%s", doiBoostNSPREFIX, AbstractDNetXsltFunctions.md5(doi)); |
142 | 193 |
entity.setId(sourceId); |
143 | 194 |
|
... | ... | |
152 | 203 |
|
153 | 204 |
final String type = getStringValue(rootElement,"type"); |
154 | 205 |
|
155 |
if (!typologiesMapping.containsKey(type)) |
|
156 |
return null; |
|
157 |
|
|
158 | 206 |
//Adding Instances |
159 | 207 |
final String typeValue = typologiesMapping.get(type).get("value"); |
160 | 208 |
final String cobjValue = typologiesMapping.get(type).get("cobj"); |
... | ... | |
185 | 233 |
acc_class_value = "closed access"; |
186 | 234 |
break; |
187 | 235 |
} |
188 |
|
|
189 | 236 |
default: { |
190 | 237 |
acc_class_value = "not available"; |
191 | 238 |
} |
... | ... | |
222 | 269 |
} |
223 | 270 |
|
224 | 271 |
if (StringUtils.isNotBlank(doiURL)) { |
225 |
|
|
226 |
|
|
227 |
|
|
228 |
|
|
229 | 272 |
final ResultProtos.Result.Instance.Builder instance = ResultProtos.Result.Instance.newBuilder(); |
230 | 273 |
instance.addUrl(doiURL); |
231 | 274 |
instance.setInstancetype(FieldTypeProtos.Qualifier.newBuilder() |
... | ... | |
266 | 309 |
|
267 | 310 |
Pair<List<FieldTypeProtos.Author>, Collection<OafProtos.Oaf>> authorsOrganizations = createAuthorsOrganization(rootElement); |
268 | 311 |
|
312 |
|
|
313 |
|
|
269 | 314 |
if (authorsOrganizations.getKey().size() > 0) { |
270 | 315 |
metadata.addAllAuthor(authorsOrganizations.getKey()); |
271 | 316 |
} |
272 | 317 |
else { |
318 |
//Should never enter here because of the isValid method at the beginning. |
|
319 |
context.incrementCounter("filtered","unexpected_no_authors", 1); |
|
273 | 320 |
return null; |
274 | 321 |
} |
275 | 322 |
//adding Language |
... | ... | |
288 | 335 |
.setQualifier(getQualifier("keyword", "dnet:subject")) |
289 | 336 |
.build())); |
290 | 337 |
|
291 |
List<String>titles =getArrayValues(rootElement, "title");
|
|
338 |
List<String> titles = getCleanedTitles(rootElement);
|
|
292 | 339 |
titles.forEach(t-> |
293 | 340 |
metadata.addTitle(FieldTypeProtos.StructuredProperty.newBuilder() |
294 | 341 |
.setValue(t) |
295 | 342 |
.setQualifier(getQualifier("main title", "dnet:dataCite_title")) |
296 | 343 |
.build())); |
344 |
|
|
297 | 345 |
settingRelevantDate(rootElement, metadata, "issued", "issued", true); |
298 | 346 |
settingRelevantDate(rootElement, metadata, "accepted", "accepted", false); |
299 | 347 |
settingRelevantDate(rootElement, metadata, "published-online", "published-online", false); |
modules/dnet-mapreduce-jobs/trunk/src/main/java/eu/dnetlib/data/mapreduce/hbase/dataimport/DOIBoostImportMapper.java | ||
---|---|---|
1 | 1 |
package eu.dnetlib.data.mapreduce.hbase.dataimport; |
2 | 2 |
|
3 |
import java.io.IOException; |
|
4 |
import java.util.List; |
|
5 |
|
|
3 | 6 |
import com.google.common.base.Joiner; |
4 | 7 |
import com.google.gson.JsonObject; |
5 | 8 |
import com.google.gson.JsonParser; |
6 | 9 |
import eu.dnetlib.actionmanager.actions.ActionFactory; |
7 | 10 |
import eu.dnetlib.actionmanager.actions.AtomicAction; |
8 | 11 |
import eu.dnetlib.actionmanager.common.Agent; |
12 |
import eu.dnetlib.data.mapreduce.hbase.Reporter; |
|
9 | 13 |
import org.apache.hadoop.io.LongWritable; |
10 | 14 |
import org.apache.hadoop.io.Text; |
11 | 15 |
import org.apache.hadoop.mapreduce.Mapper; |
12 | 16 |
|
13 |
import java.io.IOException; |
|
14 |
import java.util.List; |
|
15 |
|
|
16 | 17 |
public class DOIBoostImportMapper extends Mapper<LongWritable, Text, Text, Text> { |
17 | 18 |
|
18 | 19 |
private String setName; |
... | ... | |
42 | 43 |
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException { |
43 | 44 |
final String inputJson = value.toString(); |
44 | 45 |
final JsonObject rootElement = parser.parse(inputJson).getAsJsonObject(); |
45 |
try { |
|
46 |
List<AtomicAction> atomicActions = DOIBoostToActions.generatePublicationActionsFromDump(rootElement, factory, setName, agent, invisible, onlyOrganization); |
|
47 |
if (atomicActions!= null) { |
|
48 |
for (AtomicAction action: atomicActions){ |
|
49 |
keyout.set(Joiner.on(SEPARATOR).join(action.getTargetRowKey(), action.getTargetColumnFamily(), action.getTargetColumn())); |
|
50 |
valueOut.set(action.toJSON()); |
|
51 |
context.write(keyout, valueOut); |
|
52 |
context.getCounter(this.getClass().getSimpleName(), action.getTargetColumnFamily()).increment(1); |
|
46 |
try { |
|
47 |
List<AtomicAction> atomicActions = |
|
48 |
DOIBoostToActions.generatePublicationActionsFromDump(rootElement, factory, setName, agent, invisible, onlyOrganization, |
|
49 |
(Reporter) (counterGroup, counterName, delta) -> context.getCounter(counterGroup, counterName).increment(delta)); |
|
50 |
if (atomicActions != null) { |
|
51 |
for (AtomicAction action : atomicActions) { |
|
52 |
keyout.set(Joiner.on(SEPARATOR).join(action.getTargetRowKey(), action.getTargetColumnFamily(), action.getTargetColumn())); |
|
53 |
valueOut.set(action.toJSON()); |
|
54 |
context.write(keyout, valueOut); |
|
55 |
context.getCounter(this.getClass().getSimpleName(), action.getTargetColumnFamily()).increment(1); |
|
56 |
} |
|
53 | 57 |
} |
58 |
} catch (Throwable e) { |
|
59 |
System.err.println(inputJson); |
|
60 |
throw e; |
|
54 | 61 |
} |
55 |
} catch (Throwable e) { |
|
56 |
System.err.println(inputJson); |
|
57 |
throw e; |
|
58 | 62 |
} |
59 |
} |
|
60 | 63 |
} |
Also available in: Unified diff
Addressing quality of the research graph: #4368 and #4360.