Project

General

Profile

« Previous | Next » 

Revision 55238

Addressing quality of the research graph: #4368 and #4360.

View differences:

modules/dnet-mapreduce-jobs/trunk/src/test/java/eu/dnetlib/data/mapreduce/actions/DOIBoostToActionsTest.java
1 1
package eu.dnetlib.data.mapreduce.actions;
2 2

  
3
import java.io.*;
4
import java.util.List;
5
import java.util.zip.DataFormatException;
6
import java.util.zip.Inflater;
7

  
3 8
import com.google.common.collect.Lists;
4 9
import com.google.gson.JsonObject;
5 10
import com.google.gson.JsonParser;
6 11
import eu.dnetlib.actionmanager.actions.ActionFactory;
7 12
import eu.dnetlib.actionmanager.actions.AtomicAction;
8 13
import eu.dnetlib.actionmanager.common.Agent;
14
import eu.dnetlib.data.mapreduce.hbase.Reporter;
9 15
import eu.dnetlib.data.mapreduce.hbase.dataimport.DOIBoostToActions;
10 16
import eu.dnetlib.data.transform.Column;
11 17
import eu.dnetlib.data.transform.Row;
......
14 20
import org.junit.Before;
15 21
import org.junit.Test;
16 22

  
17
import java.io.*;
18
import java.util.List;
19
import java.util.zip.DataFormatException;
20
import java.util.zip.Inflater;
21

  
22 23
public class DOIBoostToActionsTest  {
23 24
    private String setName;
24 25
    private Agent agent;
26
    private Reporter reporter;
25 27

  
26 28

  
27

  
28 29
    @Before
29 30
    public void setup() {
30 31
        setName = "DLI";
31 32
        agent= new Agent("agentId","agentName", Agent.AGENT_TYPE.service);
33
        reporter = (Reporter) (counterGroup, counterName, delta) -> System.out.println(String.format("COUNTER: %s - %s : %d", counterGroup, counterName, delta));
32 34
    }
33 35

  
34 36
    @Test
35 37
    public void testSingleDOIBoostAction() throws IOException {
36 38
        doTestSingleDOIBoostAction("/eu/dnetlib/data/mapreduce/actions/DOIBoostAction.json");
37 39
    }
40
    @Test
41
    public void testSingleDOIBoostActionFilter() throws IOException {
42
        doTestSingleDOIBoostAction("/eu/dnetlib/data/mapreduce/actions/DOIBoostAction_filterOut.json");
43
    }
38 44

  
39 45

  
40 46
    @Test
......
94 100

  
95 101
        final JsonParser parser = new JsonParser();
96 102
        JsonObject root = parser.parse(line).getAsJsonObject();
97
        List<AtomicAction> actions = DOIBoostToActions.generatePublicationActionsFromDump(root, new ActionFactory(), setName, agent, false, false);
103
        List<AtomicAction> actions = DOIBoostToActions.generatePublicationActionsFromDump(root, new ActionFactory(), setName, agent, false, false,reporter);
104

  
98 105
        if (actions!= null) {
99 106
            actions.forEach(action-> {
100 107
                if (action.getTargetColumn().equals("body") && action.getTargetColumnFamily().equals("result"))
......
121 128

  
122 129
        final JsonParser parser = new JsonParser();
123 130
        JsonObject root = parser.parse(line).getAsJsonObject();
124
        List<AtomicAction> actions = DOIBoostToActions.generatePublicationActionsFromDump(root, new ActionFactory(), setName, agent, false, false);
131
        List<AtomicAction> actions = DOIBoostToActions.generatePublicationActionsFromDump(root, new ActionFactory(), setName, agent, false, false, reporter);
125 132
        if (actions!= null) {
126 133

  
127 134
            actions.forEach(it -> System.out.println(String.format(" RowKey:%s TargetColumnFamily:%s   TargetColumn: %s", it.getTargetRowKey(), it.getTargetColumnFamily(), it.getTargetColumn())));
......
141 148
            final JsonParser parser = new JsonParser();
142 149
            JsonObject root = parser.parse(line).getAsJsonObject();
143 150
            try {
144
                List<AtomicAction> atomicActions = DOIBoostToActions.generatePublicationActionsFromDump(root, new ActionFactory(), setName, agent, false, false);
151
                List<AtomicAction> atomicActions = DOIBoostToActions.generatePublicationActionsFromDump(root, new ActionFactory(), setName, agent, false, false, reporter);
145 152
                if (atomicActions!= null)
146 153
                {
147 154
                    i ++;
modules/dnet-mapreduce-jobs/trunk/src/test/resources/eu/dnetlib/data/mapreduce/actions/DOIBoostAction_filterOut.json
1
{"publisher": "Elsevier BV", "doi": "10.1002/(sici)1098-0997(1999)7:5<248::aid-idog8>3.0.co;2-v", "hostedByOpenAire": {"name": "Infectious Diseases in Obstetrics and Gynecology", "id": "doajarticles::1064-7449"}, "license": [{"url": "http://doi.wiley.com/10.1002/tdm_license_1.1", "content-version": "tdm", "\"delay-in-days": null, "date-time": "2015-09-01T00:00:00Z"}, {"url": "https://doi.org/10.1002/(sici)1098-0997(1999)7:5<248::aid-idog8>3.0.co;2-v", "provenance": "UnpayWall", "access-rights": "OPEN"}], "title": ["Pelvic inflammatory disease in the postmenopausal woman"], "issued": "1999-1-1", "abstract": [{"provenance": "MAG", "value": "eJx9kk1vFDEMhv+KjyBtR2JFC7vXtqBKLUVcOHsynhmjTBzFzqzm3+PsAuUAvSRx5I/Hr/3c/6Bg\nvNIRvtHKdAJckSP2kSCyUUGrhUASZIorB+A0RlwWNCkbDKyESv4JWdQWSpKxKkY4iRsd3JHylI7w\ndH/3+PDl/u+U5VJuLLLAu8PNDZj4fTh0DqI1mh7h62slFWoKsiyO9t/yD9b8ssRt4VCkZ4w7kNEo\ntf8gKdRSKBmc2Gaw2ousWBgTYK+BVGGU4oVZ0g4wDS3qEo6qEhiNhkus2EzF4XBKoqQd3HryWNUj\nvY9/wF3CXhMVXaSe1MAKvRTilJ2nMWds6C5ndDTjPx3qmRRzLpJLQwRecOI0gVoduMF98hG3Gbjk\nhTSL+/uzZelZjIOCzlLjABuTnwhRTmCz+87itosCWstEZbvI4koqD22y3q8rBBgdLGFbrBdRfift\nCc7ghpxo8Cml0XfQl4Ud7blXIzc+b4mCxA4+HPfvP17tr/e7X+vx5vbt+fWdI21Xj6y68xSh+wkk\nbARZ\n"}], "issn": [{"type": "print", "value": "1064-7449"}, {"type": "electronic", "value": "1098-0997"}], "doi-url": "http://dx.doi.org/10.1002/(sici)1098-0997(1999)7:5<248::aid-idog8>3.0.co;2-v", "instances": [{"url": "https://api.wiley.com/onlinelibrary/tdm/v1/articles/10.1002%2F(SICI)1098-0997(1999)7:5%3C248::AID-IDOG8%3E3.0.CO;2-V", "provenance": "CrossRef", "access-rights": "UNKNOWN"}, {"url": "https://onlinelibrary.wiley.com/doi/full/10.1002/(SICI)1098-0997(1999)7:5%3C248::AID-IDOG8%3E3.0.CO;2-V", "provenance": "CrossRef", "access-rights": "UNKNOWN"}], "authors": [{"affiliations": [{"official-page": "http://www.musc.edu/", "provenance": "MAG", "value": "Medical University of South Carolina", "identifiers": [{"value": "http://en.wikipedia.org/wiki/Medical_University_of_South_Carolina", 
"schema": "wikpedia"}, {"value": "grid.259828.c", "schema": "grid.ac"}, {"value": "https://academic.microsoft.com/#/detail/153297377", "schema": "URL"}]}], "given": "Addie", "identifiers": [{"provenance": "MAG", "value": "https://academic.microsoft.com/#/detail/2517302922", "schema": "URL"}], "fullname": "", "family": "Jackson"}, {"affiliations": [{"official-page": "http://www.musc.edu/", "provenance": "MAG", "value": "Medical University of South Carolina", "identifiers": [{"value": "http://en.wikipedia.org/wiki/Medical_University_of_South_Carolina", "schema": "wikpedia"}, {"value": "grid.259828.c", "schema": "grid.ac"}, {"value": "https://academic.microsoft.com/#/detail/153297377", "schema": "URL"}]}], "given": "D.E.", "identifiers": [{"provenance": "MAG", "value": "https://academic.microsoft.com/#/detail/2163996951", "schema": "URL"}], "fullname": "D.E. Soper", "family": "Soper"}], "published-print": "1999-1-1", "collectedFrom": ["CrossRef", "MAG", "UnpayWall"], "accepted": null, "type": "journal-article", "published-online": null, "subject": ["Obstetrics and Gynaecology", "Infectious Diseases", "Dermatology"]}
modules/dnet-mapreduce-jobs/trunk/src/main/java/eu/dnetlib/data/mapreduce/hbase/Reporter.java
1
package eu.dnetlib.data.mapreduce.hbase;
2

  
3
import java.io.Serializable;
4

  
5
/**
 * Callback through which import code reports counter increments, identified by a
 * counter group and a counter name.
 *
 * The interface has a single abstract method so that callers can supply it as a
 * lambda; {@code @FunctionalInterface} makes the compiler enforce that contract.
 *
 * Created by Alessia Bardi on 2019-04-08.
 *
 * @author Alessia Bardi
 */
@FunctionalInterface
public interface Reporter extends Serializable {

	/**
	 * Reports that the given counter must be incremented.
	 *
	 * @param counterGroup name of the group the counter belongs to (e.g. "filtered")
	 * @param counterName  name of the counter inside the group
	 * @param delta        amount to add to the counter
	 */
	void incrementCounter(String counterGroup, String counterName, long delta);

}
modules/dnet-mapreduce-jobs/trunk/src/main/java/eu/dnetlib/data/mapreduce/hbase/dataimport/DOIBoostToActions.java
1 1
package eu.dnetlib.data.mapreduce.hbase.dataimport;
2 2

  
3
import java.io.ByteArrayOutputStream;
4
import java.io.IOException;
5
import java.io.InputStream;
6
import java.util.*;
7
import java.util.concurrent.atomic.AtomicInteger;
8
import java.util.function.Function;
9
import java.util.stream.Collectors;
10
import java.util.zip.Inflater;
11

  
3 12
import com.google.gson.Gson;
4 13
import com.google.gson.JsonElement;
5 14
import com.google.gson.JsonObject;
6
import com.googlecode.protobuf.format.JsonFormat;
7 15
import eu.dnetlib.actionmanager.actions.ActionFactory;
8 16
import eu.dnetlib.actionmanager.actions.AtomicAction;
9 17
import eu.dnetlib.actionmanager.common.Agent;
18
import eu.dnetlib.data.mapreduce.hbase.Reporter;
10 19
import eu.dnetlib.data.mapreduce.util.StreamUtils;
11 20
import eu.dnetlib.data.proto.*;
12 21
import eu.dnetlib.data.transform.xml.AbstractDNetXsltFunctions;
......
15 24
import org.apache.commons.io.IOUtils;
16 25
import org.apache.commons.lang3.StringUtils;
17 26

  
18
import java.io.ByteArrayOutputStream;
19
import java.io.IOException;
20
import java.io.InputStream;
21
import java.util.*;
22
import java.util.concurrent.atomic.AtomicInteger;
23
import java.util.function.Function;
24
import java.util.stream.Collectors;
25
import java.util.zip.Inflater;
26

  
27 27
import static eu.dnetlib.data.mapreduce.hbase.dataimport.DumpToActionsUtility.*;
28 28
import static eu.dnetlib.data.proto.ResultOrganizationProtos.ResultOrganization;
29 29

  
......
89 89
        }
90 90
    }
91 91

  
92
    protected static boolean isValid(final JsonObject rootElement, final Reporter context){
92 93

  
94
        final String doi = getStringValue(rootElement, "doi");
95
        if (doi == null) {
96
            context.incrementCounter("filtered","no_doi", 1);
97
            return false;
98
        }
99
        final String type = getStringValue(rootElement,"type");
100
        if (!typologiesMapping.containsKey(type)){
101
            context.incrementCounter("filtered","unknowntype_"+type, 1);
102
            return false;
103
        }
104
        // fixes #4360 (test publisher)
105
        final String publisher = getStringValue(rootElement, "publisher");
106
        if(publisher.equalsIgnoreCase("Test accounts")){
107
            context.incrementCounter("filtered","test_publisher", 1);
108
            return false;
109
        }
93 110

  
111
        List<JsonObject> authors = getArrayObjects(rootElement, "authors");
112
        boolean hasAuthor = false;
113
        for(JsonObject author : authors){
114
            final String given = getStringValue(author, "given");
115
            final String family = getStringValue(author, "family");
116
            final  String fullname = getStringValue(author, "fullname");
117
            if (StringUtils.isNotBlank(fullname) || (StringUtils.isNotBlank(given) && StringUtils.isNotBlank(family))) {
118
                hasAuthor = true;
119
            }
120
            // fixes #4368
121
            if(StringUtils.isNotBlank(given) && given.equalsIgnoreCase("Addie") && StringUtils.isNotBlank(family) && family.equalsIgnoreCase("Jackson") && publisher.equalsIgnoreCase("Elsevier BV")){
122
                context.incrementCounter("filtered","addieJackson", 1);
123
                return false;
124
            }
125
        }
126
        if(!hasAuthor){
127
            context.incrementCounter("filtered","no_authors", 1);
128
            return false;
129
        }
130
        // fixes #4360
131
        if(getCleanedTitles(rootElement).isEmpty()){
132
            context.incrementCounter("filtered","invalid_title", 1);
133
            return false;
134
        }
94 135

  
136
        return true;
137
    }
138

  
139
    private static List<String> getCleanedTitles(final JsonObject rootElement){
140
        List<String> titles = getArrayValues(rootElement, "title");
141
        return titles.stream().filter( t -> StringUtils.isNotBlank(t) && !t.equalsIgnoreCase("[NO TITLE AVAILABLE]")).collect(Collectors.toList());
142
    }
143

  
144

  
95 145
    public static List<AtomicAction> generatePublicationActionsFromDump(final JsonObject rootElement, final ActionFactory factory, final String setName, final Agent agent, boolean invisible,
96
                                                                        final boolean onlyOrganization) {
146
                                                                        final boolean onlyOrganization, final Reporter context) {
97 147

  
148
        if(!isValid(rootElement, context)) return null;
149

  
98 150
        //Create OAF Proto
99 151

  
100 152
        final OafProtos.Oaf.Builder oaf = OafProtos.Oaf.newBuilder();
......
135 187
        }
136 188
        //Adding identifier
137 189
        final String doi = getStringValue(rootElement, "doi");
138
        if (doi == null)
139
            return null;
140 190
        entity.addOriginalId(doi);
191

  
141 192
        final String sourceId = String.format("50|%s" + SEPARATOR + "%s", doiBoostNSPREFIX, AbstractDNetXsltFunctions.md5(doi));
142 193
        entity.setId(sourceId);
143 194

  
......
152 203

  
153 204
        final String type = getStringValue(rootElement,"type");
154 205

  
155
        if (!typologiesMapping.containsKey(type))
156
            return null;
157

  
158 206
        //Adding Instances
159 207
        final String typeValue = typologiesMapping.get(type).get("value");
160 208
        final String cobjValue = typologiesMapping.get(type).get("cobj");
......
185 233
                    acc_class_value = "closed access";
186 234
                    break;
187 235
                }
188

  
189 236
                default: {
190 237
                    acc_class_value = "not available";
191 238
                }
......
222 269
        }
223 270

  
224 271
        if (StringUtils.isNotBlank(doiURL)) {
225

  
226

  
227

  
228

  
229 272
            final ResultProtos.Result.Instance.Builder instance = ResultProtos.Result.Instance.newBuilder();
230 273
            instance.addUrl(doiURL);
231 274
            instance.setInstancetype(FieldTypeProtos.Qualifier.newBuilder()
......
266 309

  
267 310
        Pair<List<FieldTypeProtos.Author>, Collection<OafProtos.Oaf>> authorsOrganizations = createAuthorsOrganization(rootElement);
268 311

  
312

  
313

  
269 314
        if (authorsOrganizations.getKey().size() > 0) {
270 315
            metadata.addAllAuthor(authorsOrganizations.getKey());
271 316
        }
272 317
        else {
318
            //Should never enter here because of the isValid check at the beginning.
319
            context.incrementCounter("filtered","unexpected_no_authors", 1);
273 320
            return null;
274 321
        }
275 322
        //adding Language
......
288 335
                .setQualifier(getQualifier("keyword", "dnet:subject"))
289 336
                .build()));
290 337

  
291
        List<String>titles =getArrayValues(rootElement, "title");
338
        List<String> titles = getCleanedTitles(rootElement);
292 339
        titles.forEach(t->
293 340
                metadata.addTitle(FieldTypeProtos.StructuredProperty.newBuilder()
294 341
                        .setValue(t)
295 342
                        .setQualifier(getQualifier("main title", "dnet:dataCite_title"))
296 343
                        .build()));
344

  
297 345
        settingRelevantDate(rootElement, metadata, "issued", "issued", true);
298 346
        settingRelevantDate(rootElement, metadata, "accepted", "accepted", false);
299 347
        settingRelevantDate(rootElement, metadata, "published-online", "published-online", false);
modules/dnet-mapreduce-jobs/trunk/src/main/java/eu/dnetlib/data/mapreduce/hbase/dataimport/DOIBoostImportMapper.java
1 1
package eu.dnetlib.data.mapreduce.hbase.dataimport;
2 2

  
3
import java.io.IOException;
4
import java.util.List;
5

  
3 6
import com.google.common.base.Joiner;
4 7
import com.google.gson.JsonObject;
5 8
import com.google.gson.JsonParser;
6 9
import eu.dnetlib.actionmanager.actions.ActionFactory;
7 10
import eu.dnetlib.actionmanager.actions.AtomicAction;
8 11
import eu.dnetlib.actionmanager.common.Agent;
12
import eu.dnetlib.data.mapreduce.hbase.Reporter;
9 13
import org.apache.hadoop.io.LongWritable;
10 14
import org.apache.hadoop.io.Text;
11 15
import org.apache.hadoop.mapreduce.Mapper;
12 16

  
13
import java.io.IOException;
14
import java.util.List;
15

  
16 17
public class DOIBoostImportMapper extends Mapper<LongWritable, Text, Text, Text> {
17 18

  
18 19
    private String setName;
......
42 43
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
43 44
        final String inputJson = value.toString();
44 45
        final JsonObject rootElement = parser.parse(inputJson).getAsJsonObject();
45
        try {
46
            List<AtomicAction> atomicActions = DOIBoostToActions.generatePublicationActionsFromDump(rootElement, factory, setName, agent, invisible, onlyOrganization);
47
            if (atomicActions!= null) {
48
                for (AtomicAction action: atomicActions){
49
                    keyout.set(Joiner.on(SEPARATOR).join(action.getTargetRowKey(), action.getTargetColumnFamily(), action.getTargetColumn()));
50
                    valueOut.set(action.toJSON());
51
                    context.write(keyout, valueOut);
52
                    context.getCounter(this.getClass().getSimpleName(), action.getTargetColumnFamily()).increment(1);
46
            try {
47
                List<AtomicAction> atomicActions =
48
                        DOIBoostToActions.generatePublicationActionsFromDump(rootElement, factory, setName, agent, invisible, onlyOrganization,
49
                                (Reporter) (counterGroup, counterName, delta) -> context.getCounter(counterGroup, counterName).increment(delta));
50
                if (atomicActions != null) {
51
                    for (AtomicAction action : atomicActions) {
52
                        keyout.set(Joiner.on(SEPARATOR).join(action.getTargetRowKey(), action.getTargetColumnFamily(), action.getTargetColumn()));
53
                        valueOut.set(action.toJSON());
54
                        context.write(keyout, valueOut);
55
                        context.getCounter(this.getClass().getSimpleName(), action.getTargetColumnFamily()).increment(1);
56
                    }
53 57
                }
58
            } catch (Throwable e) {
59
                System.err.println(inputJson);
60
                throw e;
54 61
            }
55
        } catch (Throwable e) {
56
            System.err.println(inputJson);
57
            throw e;
58 62
        }
59
    }
60 63
}

Also available in: Unified diff