Project

General

Profile

« Previous | Next » 

Revision 45625

Added by Eri Katsari about 7 years ago

Refactored Build according to new parsing.

View differences:

modules/dnet-openaire-lodinterlinking/branches/cacheOptimized/src/test/java/BuildTest.java
25 25
        BlockReducer blockReducer = new BlockReducer();
26 26

  
27 27
        Configuration configuration = new Configuration();
28
        configuration.set("lod.sourceMappings", "{\"result\":[\"http://www.w3.org/1999/02/22-rdf-syntax-ns#type\",\"http://purl.org/dc/terms/identifier\",\"http://lod.openaire.eu/vocab/dateOfTransformation\",\"http://lod.openaire.eu/vocab/dateOfCollection\",\"http://purl.org/dc/terms/identifier\",\"http://www.eurocris.org/ontologies/cerif/1.3#name\",\"http://purl.org/dc/terms/dateAccepted\",\"http://purl.org/dc/terms/publisher\",\"http://purl.org/dc/terms/identifier\",\"http://purl.org/dc/terms/language\",\"http://purl.org/dc/terms/date\",\"http://lod.openaire.eu/vocab/resultSubject\",\"http://lod.openaire.eu/vocab/externalReference\",\"http://purl.org/dc/terms/source\",\"http://purl.org/dc/terms/format\",\"http://lod.openaire.eu/vocab/context\",\"http://dbpedia.org/ontology/country\",\"http://purl.org/dc/terms/accessRights\",\"http://purl.org/dc/terms/description\",\"http://lsdis.cs.uga.edu/projects/semdis/opus#journal_name\",\"http://lod.openaire.eu/vocab/dataSourceType\",\"http://lod.openaire.eu/vocab/device\",\"http://lod.openaire.eu/vocab/size\",\"http://lod.openaire.eu/vocab/version\",\"http://lod.openaire.eu/vocab/lastMetadataUpdate\",\"http://lod.openaire.eu/vocab/metadataVersion\",\"http://lod.openaire.eu/vocab/year\",\"http://lod.openaire.eu/vocab/resultType\"],\"project\": 
[\"http://www.w3.org/1999/02/22-rdf-syntax-ns#type\",\"http://purl.org/dc/terms/identifier\",\"http://lod.openaire.eu/vocab/dateOfTransformation\",\"http://lod.openaire.eu/vocab/dateOfCollection\",\"http://purl.org/dc/terms/identifier\",\"http://lod.openaire.eu/vocab/projectCode\",\"http://schema.org/url\",\"http://www.eurocris.org/ontologies/cerif/1.3#acronym\",\"http://www.eurocris.org/ontologies/cerif/1.3#name\",\"http://www.eurocris.org/ontologies/cerif/1.3#startDate\",\"http://www.eurocris.org/ontologies/cerif/1.3#endDate\",\"http://purl.org/cerif/frapo/hasCallIdentifier\",\"http://www.eurocris.org/ontologies/cerif/1.3#keyword\",\"http://www.w3.org/2006/time#hasDurationDescription\",\"http://lod.openaire.eu/vocab/ec_SC39\",\"http://lod.openaire.eu/vocab/contractType\",\"http://lod.openaire.eu/vocab/oaMandatePublications\",\"http://lod.openaire.eu/vocab/projectSubjects\",\"http://od.openaire.eu/vocab/ec_article29-3\",\"http://lod.openaire.eu/vocab/funder\",\"http://lod.openaire.eu/vocab/fundingLevel0\",\"http://lod.openaire.eu/vocab/fundingLevel1\",\"http://lod.openaire.eu/vocab/fundingLevel2\",\"http://lod.openaire.eu/vocab/fundingLevel3\"],\"person\": [\"http://www.w3.org/1999/02/22-rdf-syntax-ns#type\",\"http://purl.org/dc/terms/identifier\",\"http://lod.openaire.eu/vocab/dateOfTransformation\",\"http://lod.openaire.eu/vocab/dateOfCollection\",\"http://purl.org/dc/terms/identifier\", \"http://xmlns.com/foaf/0.1/firstName\",\"http://xmlns.com/foaf/0.1/lastName\", \"http://xmlns.com/foaf/0.1/name\",\"http://schema.org/faxNumber\",\"http://xmlns.com/foaf/0.1/mbox\",\"http://xmlns.com/foaf/0.1/phone\", \"http://schema.org/nationality\",\"http://purl.org/dc/terms/identifier\", \"http://lod.openaire.eu/vocab/trust\"],\"organization\": 
[\"http://www.w3.org/1999/02/22-rdf-syntax-ns#type\",\"http://purl.org/dc/terms/identifier\",\"http://lod.openaire.eu/vocab/dateOfTransformation\",\"http://lod.openaire.eu/vocab/dateOfCollection\",\"http://purl.org/dc/terms/identifier\",\"http://www.w3.org/2004/02/skos/core#altLabel\",\"http://www.w3.org/2004/02/skos/core#prefLabel\",\"http://lod.openaire.eu/vocab/webSiteUrl\",\"http://xmlns.com/foaf/0.1/logo\",\"http://dbpedia.org/ontology/country\",\"http://lod.openaire.eu/vocab/entityType\" ],\"datasource\":[\"http://www.w3.org/1999/02/22-rdf-syntax-ns#type\",\"http://purl.org/dc/terms/identifier\",\"http://lod.openaire.eu/vocab/dateOfTransformation\",\"http://lod.openaire.eu/vocab/dateOfCollection\",\"http://purl.org/dc/terms/identifier\",\"http://lod.openaire.eu/vocab/datasourceType\",\"http://lod.openaire.eu/vocab/openAIRECompatibility\",\"http://dbpedia.org/ontology/officialName\",\"http://lod.openaire.eu/vocab/englishName\",\"http://schema.org/url\",\"http://xmlns.com/foaf/0.1/logo\",\"http://xmlns.com/foaf/0.1/mbox\",\"http://purl.org/vocab/vann/preferredNamespacePrefix\",\"http://www.w3.org/2003/01/geo/wgs84_pos#lat\",\"http://www.w3.org/2003/01/geo/wgs84_pos#long\",\"http://lod.openaire.eu/vocab/dateOfValidity\",\"http://purl.org/dc/terms/description\",\"http://lod.openaire.eu/vocab/subjectList\",\"http://lod.openaire.eu/numberOfItems\",\"http://purl.org/dc/terms/date\",\"http://lod.openaire.eu/vocab/policies\",\"http://lod.openaire.eu/vocab/languages\",\"http://lod.openaire.eu/vocab/contentType\",\"http://lod.openaire.eu/vocab/accessInfoPackage\",\"http://lod.openaire.eu/vocab/releaseStartDate\",\"http://lod.openaire.eu/vocab/releaseEndDate\",\"http://lod.openaire.eu/vocab/missionStatementUrl\",\"http://www.europeana.eu/schemas/edm/dataProvider\",\"http://lod.openaire.eu/vocab/serviceProvider\",\"http://lod.openaire.eu/vocab/databaseAccessType\",\"http://lod.openaire.eu/vocab/dataUploadType\",\"http://lod.openaire.eu/vocab/dataUploadRestrictions\",\"http
://lod.openaire.eu/vocab/versioning\",\"http://lod.openaire.eu/vocab/citationGuidelineUrl\",\"http://lod.openaire.eu/vocab/qualityManagementKind\",\"http://lod.openaire.eu/vocab/pidSystems\",\"http://lod.openaire.eu/vocab/certificates\",\"http://purl.org/dc/terms/accessRights\"]}");
29
        configuration.set("lod.redisHost", "194.177.192.118");
28
        String sourceMappings = "{\"type\":\"result\", \"fields\":[\"<http://purl.org/dc/terms/identifier>\",\"<http://www.eurocris.org/ontologies/cerif/1.3#name>\",\"<http://lod.openaire.eu/vocab/year>\"]}";
29
        configuration.set("lod.redisHost", "83.212.96.39");
30
        configuration.set("lod.sourceMappings", sourceMappings);
31

  
32
        String stopwords = "a,able,about,above,abst,accordance,according,accordingly,across,act,actually,added,adj,affected,affecting,affects,after,afterwards,again,against,ah,all,almost,alone,along,already,also,although,always,am,among,amongst,an,and,announce,another,any,anybody,anyhow,anymore,anyone,anything,anyway,anyways,anywhere,apparently,approximately,are,aren,arent,arise,around,as,aside,ask,asking,at,auth,available,away,awfully,b,back,be,became,because,become,becomes,becoming,been,before,beforehand,begin,beginning,beginnings,begins,behind,being,believe,below,beside,besides,between,beyond,biol,both,brief,briefly,but,by,c,ca,came,can,cannot,cant,cause,causes,certain,certainly,co,com,come,comes,contain,containing,contains,could,couldnt,d,date,did,didnt,different,do,does,doesnt,doing,done,dont,down,downwards,due,during,e,each,ed,edu,effect,eg,eight,eighty,either,else,elsewhere,end,ending,enough,especially,et,et-al,etc,even,ever,every,everybody,everyone,everything,everywhere,ex,except,f,far,few,ff,fifth,first,five,fix,followed,following,follows,for,former,formerly,forth,found,four,from,further,furthermore,g,gave,get,gets,getting,give,given,gives,giving,go,goes,gone,got,gotten,h,had,happens,hardly,has,hasnt,have,havent,having,he,hed,hence,her,here,hereafter,hereby,herein,heres,hereupon,hers,herself,hes,hi,hid,him,himself,his,hither,home,how,howbeit,however,hundred,i,id,ie,if,ill,im,immediate,immediately,importance,important,in,inc,indeed,index,information,instead,into,invention,inward,is,isnt,it,itd,itll,its,itself,ive,j,just,k,keep,      
keeps,kept,kg,km,know,known,knows,l,largely,last,lately,later,latter,latterly,least,less,lest,let,lets,like,liked,likely,line,little,ll,look,looking,looks,ltd,m,made,mainly,make,makes,many,may,maybe,me,mean,means,meantime,meanwhile,merely,mg,might,million,miss,ml,more,moreover,most,mostly,mr,mrs,much,mug,must,my,myself,n,na,name,namely,nay,nd,near,nearly,necessarily,necessary,need,needs,neither,never,nevertheless,new,next,nine,ninety,no,nobody,non,none,nonetheless,noone,nor,normally,nos,not,noted,nothing,now,nowhere,o,obtain,obtained,obviously,of,off,often,oh,ok,okay,old,omitted,on,once,one,ones,only,onto,or,ord,other,others,otherwise,ought,our,ours,ourselves,out,outside,over,overall,owing,own,p,page,pages,part,particular,particularly,past,per,perhaps,placed,please,plus,poorly,possible,possibly,potentially,pp,predominantly,present,previously,primarily,probably,promptly,proud,provides,put,q,que,quickly,quite,qv,r,ran,rather,rd,re,readily,really,recent,recently,ref,refs,regarding,regardless,regards,related,relatively,research,respectively,resulted,resulting,results,right,run,s,said,same,saw,say,saying,says,sec,section,see,seeing,seem,seemed,seeming,seems,seen,self,selves,sent,seven,several,shall,she,shed,shell,shes,should,shouldnt,show,showed,shown,showns,shows,significant,significantly,similar,similarly,since,six,slightly,so,some,somebody,somehow,someone,somethan,something,sometime,sometimes,somewhat,somewhere,soon,sorry,specifically,specified,specify,specifying,still,stop,strongly,sub,substantially,successfully,such,sufficiently,suggest,sup,sure,t,take,taken,taking,tell,tends,th,than,thank,thanks,thanx,that,thatll,thats,thatve,the,their,theirs,them,themselves,then,thence,there,thereafter,thereby,thered,therefore,therein,therell,thereof,therere,theres,thereto,thereupon,thereve,these,they,theyd,theyll,theyre,theyve,think,this,those,thou,though,thoughh,thousand,throug,through,throughout,thru,thus,til,tip,to,together,too,took,toward,towards,tried,tries,truly,try,trying,
ts,twice,two,u,un,under,unfortunately,unless,unlike,unlikely,until,unto,up,upon,ups,us,use,used,useful,usefully,usefulness,uses,using,usually,v,value,various,ve,very,via,viz,vol,vols,vs,w,want,wants,was,wasnt,way,we,wed,welcome,well,went,were,werent,weve,what,whatever,whatll,whats,when,whence,whenever,where,whereafter,whereas,whereby,wherein,wheres,whereupon,wherever,whether,which,while,whim,whither,who,whod,whoever,whole,wholl,whom,whomever,whos,whose,why,widely,willing,wish,with,within,without,wont,words,world,would,wouldnt,www,x,y,yes,yet,you,youd,youll,your,youre,yours,yourself,yourselves,youve,z,zero\n";
33
        configuration.set("lod.stopwords", stopwords);
34

  
30 35
        configuration.set("lod.redisPort", "6379");
31 36

  
37
        String targetMappings = "{\"type\":\"publications\",\"fields\":[\"<http://purl.org/dc/terms/issued>\",\"<http://www.w3.org/2000/01/rdf-schema#label>\",\"<http://purl.org/dc/terms/identifier>\"]}";
38
        configuration.set("lod.targetMappings", targetMappings);
39

  
32 40
        sourceMapDriver = MapDriver.newMapDriver(sourceBuildMapper).withConfiguration(configuration);
33 41
        targetMapDriver = MapDriver.newMapDriver(targetBuildMapper).withConfiguration(configuration);
34 42

  
......
47 55
    @Test
48 56
    public void testTargetMapper() throws IOException {
49 57

  
50
        targetMapDriver.withInput(new LongWritable(1), new Text("id,<http://dblp.l3s.de/d2r/resource/publications/journals/advai/Luis-GarciaP16>,<http://www.w3.org/1999/02/22-rdf-syntax-ns#type>,<http://xmlns.com/foaf/0.1/Document>\n"));
58
        targetMapDriver.withInput(new LongWritable(1), new Text("<http://dblp.l3s.de/d2r/resource/publications/books/acm/0082477>\t<http://purl.org/dc/terms/issued>\t\"1992\"\t.\t<http://dblp.l3s.de/d2r/resource/publications/books/acm/0082477>\t<http://www.w3.org/2000/01/rdf-schema#label>\t\"The no-nonsense guide to computing careers.\"\t.\t"));
51 59
        targetMapDriver.run();
52 60
    }
53 61

  
modules/dnet-openaire-lodinterlinking/branches/cacheOptimized/src/test/java/PreprocessingTest.java
39 39
        Configuration configuration = new Configuration();
40 40
        String sourceMappings = "{\"type\":\"result\", \"fields\":[\"<http://purl.org/dc/terms/identifier>\",\"<http://www.eurocris.org/ontologies/cerif/1.3#name>\",\"<http://lod.openaire.eu/vocab/year>\"]}";
41 41
        configuration.set("lod.sourceMappings", sourceMappings);
42

  
43 42
        mapDriver = MapDriver.newMapDriver(new SourceMapper());
44 43
        mapDriver.withConfiguration(configuration);
45 44
        mapDriver.withInput(new LongWritable(1),  new Text("<http://lod.openaire.eu/data/result/od_______908::bbaecb13949279cda128a66545446b76> <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://www.eurocris.org/ontologies/cerif/1.3#ResultEntity> .\n"))
modules/dnet-openaire-lodinterlinking/branches/cacheOptimized/src/main/java/eu/dnetlib/data/mapreduce/hbase/lodExport/build/SourceBuildMapper.java
9 9
import org.apache.hadoop.io.Text;
10 10
import org.apache.hadoop.mapreduce.Mapper;
11 11
import org.apache.log4j.Logger;
12
import org.matheclipse.core.reflection.system.E;
12 13

  
13 14
import java.io.BufferedWriter;
14 15
import java.io.IOException;
......
47 48
    private String uriPrefix;
48 49
    private String stopWords;
49 50
    private Map<String, Integer> stopWordsMap = new HashMap();
51
    private static final String LINE_DELIM="\t.\t";
52
    private static final String FIELD_DELIM="\t";
50 53

  
51

  
52 54
    public static enum SOURCE_BUILD_COUNTERS {
53 55
        BLOCKING_KEYS,
54 56
        REDIS_RECORDS
......
57 59

  
58 60
    @Override
59 61
    protected void setup(Context context) throws IOException, InterruptedException {
60
        lodConfiguration = new LodConfiguration();
61
        lodConfiguration.load(context.getConfiguration().get("lod.sourceMappings"));
62
        redisHost = context.getConfiguration().get("lod.redisHost");
63
        redisPort = Integer.parseInt(context.getConfiguration().get("lod.redisPort"));
64
        log.debug("Redis connection info : " + "redis://" + redisHost + ":" + redisPort);
65
        client = RedisClient.create("redis://" + redisHost + ":" + redisPort);
66
        connection = client.connect();
67
        uriPrefix = context.getConfiguration().get("lod.prefix");
68
        stopWords = context.getConfiguration().get("lod.stopwords");
62
        try {
63
            lodConfiguration = new LodConfiguration();
64
            lodConfiguration.load(context.getConfiguration().get("lod.sourceMappings"));
65
            redisHost = context.getConfiguration().get("lod.redisHost");
66
            redisPort = Integer.parseInt(context.getConfiguration().get("lod.redisPort"));
67
            log.debug("Redis connection info : " + "redis://" + redisHost + ":" + redisPort);
68
            client = RedisClient.create("redis://" + redisHost + ":" + redisPort);
69
            connection = client.connect();
70
            uriPrefix = context.getConfiguration().get("lod.prefix");
71
            stopWords = context.getConfiguration().get("lod.stopwords");
69 72

  
70
        for (String stopword : stopWords.split(",")) {
71
            stopWordsMap.put(stopword, 0);
73
            for (String stopword : stopWords.split(",")) {
74
                stopWordsMap.put(stopword, 0);
75
            }
76
            System.out.println("Stopwords size " + stopWordsMap.size());
77
            log.debug("Stopwords size " + stopWordsMap.size());
78
        } catch (Exception ex) {
79
            log.error("An error occured during Mapper Setup " + ex.toString(), ex);
80
            System.out.println(ex.getCause().toString());
72 81
        }
73 82
    }
74 83

  
......
78 87
        try {
79 88

  
80 89
            StringBuilder id = new StringBuilder();
81
            String[] triples = result.toString().split(".");
82
            for(String triple:triples){
83
                String [] fields=triple.split("\t");
84
                if(id.length()<1){
85
                    id=id.append("source_").append(fields[0]);
90
            String[] triples = result.toString().split(LINE_DELIM);
91
            for (String triple : triples) {
92
                String[] fields = triple.split(FIELD_DELIM);
93
                if (id.length() < 1) {
94
                    id = id.append("source_").append(fields[0]);
86 95
                }
87 96

  
88
                String property=fields[1];
89
                String value=fields[2];
97
                String property = fields[1];
98
                String value = fields[2];
90 99
                List<String> blockingKeys = Blocking.tokenBlocking(value, stopWordsMap);
91 100
                for (String blockingKey : blockingKeys) {
92 101
                    //Write BlockingKey, RecordID to output
......
94 103
                    context.getCounter(SOURCE_BUILD_COUNTERS.BLOCKING_KEYS).increment(1);
95 104
                }
96 105
            }
97
            writeToRedis(id.toString(), result.toString(),context);
106
            writeToRedis(id.toString(), result.toString(), context);
98 107
        } catch (Exception e) {
99 108
            log.error("Error writing entity to M/R output", e);
100 109
            log.error("result error    " + result.toString());
......
105 114
    }
106 115

  
107 116

  
108
    private void writeToRedis(String key, String value,Context context) throws Exception {
117
    private void writeToRedis(String key, String value, Context context) throws Exception {
109 118

  
110 119
        try {
111 120
            connection.set(key, value);
modules/dnet-openaire-lodinterlinking/branches/cacheOptimized/src/main/java/eu/dnetlib/data/mapreduce/hbase/lodExport/build/TargetBuildMapper.java
38 38
    private RedisClient client;
39 39
    private String redisHost;
40 40
    private Integer redisPort;
41

  
42 41
    private FileSystem hdfs;
43
    private OutputStream os;
44
    private BufferedWriter br;
45

  
46 42
    private String uriPrefix;
47 43
    private String stopWords;
48 44
    private Map<String, Integer> stopWordsMap = new HashMap<>();
49
    private String entityType;
45
    private static final String LINE_DELIM="\t.\t";
46
    private static final String FIELD_DELIM="\t";
50 47

  
51 48
    public static enum TARGET_BUILD_COUNTERS {
52 49

  
......
58 55
    @Override
59 56
    protected void setup(Context context) throws IOException, InterruptedException {
60 57

  
61
        lodConfiguration = new LodConfiguration();
62
        lodConfiguration.load(context.getConfiguration().get("lod.targetMappings"));
63
        redisHost = context.getConfiguration().get("lod.redisHost");
64
        redisPort = Integer.parseInt(context.getConfiguration().get("lod.redisPort"));
65
        client = RedisClient.create("redis://" + redisHost + ":" + redisPort);
66
        log.debug("Redis connection info : " + "redis://" + redisHost + ":" + redisPort);
58
        try {
59
            lodConfiguration = new LodConfiguration();
67 60

  
68
        connection = client.connect();
69
        uriPrefix = context.getConfiguration().get("lod.prefix");
70
        stopWords = context.getConfiguration().get("lod.stopwords");
71
        for (String stopword : stopWords.split(",")) {
72
            stopWordsMap.put(stopword, 0);
61
            lodConfiguration.load(context.getConfiguration().get("lod.targetMappings"));
62
            redisHost = context.getConfiguration().get("lod.redisHost");
63
            redisPort = Integer.parseInt(context.getConfiguration().get("lod.redisPort"));
64
            client = RedisClient.create("redis://" + redisHost + ":" + redisPort);
65
            log.debug("Redis connection info : " + "redis://" + redisHost + ":" + redisPort);
66

  
67
            connection = client.connect();
68
            uriPrefix = context.getConfiguration().get("lod.prefix");
69
            stopWords = context.getConfiguration().get("lod.stopwords");
70
            for (String stopword : stopWords.split(",")) {
71
                stopWordsMap.put(stopword, 0);
72
            }
73

  
74
            log.info("Stopwords size " + stopWordsMap.size());
75
            System.out.println("Stopwords size " + stopWordsMap.size());
76

  
77
        } catch (Exception ex) {
78
            log.error("An error occured during Mapper Setup " + ex.toString(), ex);
79
            System.out.println(ex.getCause().toString());
73 80
        }
74 81

  
75 82
    }
......
77 84

  
78 85
    @Override
79 86
    protected void map(final LongWritable keyIn, final Text result, final Context context) throws IOException {
80

  
81 87
        try {
82 88
            //get ID
83 89
            StringBuilder id = new StringBuilder();
84
            String[] triples = result.toString().split(".");
85
            for(String triple:triples){
86
                String [] fields=triple.split("\t");
87

  
88
                if(id.length()<1){
89
                    id=id.append("target_").append(fields[0]);
90
            String[] triples = result.toString().split(LINE_DELIM);
91
            for (String triple : triples) {
92
                String[] fields = triple.split(FIELD_DELIM);
93
                if (id.length()<1) {
94
                      id.append("target_").append(fields[0]);
90 95
                }
91 96

  
92
                String value=fields[2];
97
                String value = fields[2];
93 98
                List<String> blockingKeys = Blocking.tokenBlocking(value, stopWordsMap);
94 99
                for (String blockingKey : blockingKeys) {
95 100
                    //Write BlockingKey, RecordID to output
......
97 102
                    context.getCounter(TARGET_BUILD_COUNTERS.BLOCKING_KEYS).increment(1);
98 103
                }
99 104
            }
100
            writeToRedis(id.toString(), result.toString(),context);
105
            writeToRedis(id.toString(), result.toString(), context);
101 106
        } catch (Exception e) {
102 107
            log.error("Error writing entity to M/R output", e);
103 108
            log.error("result error    " + result.toString());
......
106 111

  
107 112
    }
108 113

  
109
    private void writeToRedis(String key, String value,Context context) throws Exception {
114
    private void writeToRedis(String key, String value, Context context) throws Exception {
110 115

  
111 116
        try {
112 117
            connection.set(key, value);
modules/dnet-openaire-lodinterlinking/branches/cacheOptimized/src/main/java/eu/dnetlib/data/mapreduce/hbase/lodExport/utils/Blocking.java
18 18

  
19 19
        List<String> blockingKeys = new ArrayList<>();
20 20
        Map<String, Integer> blockingKeysMap = new HashMap<>();
21
        System.out.println("generated tokens " + tokens);
21 22

  
22 23
        for (int j = 0; j < tokens.length; j++) {
23 24
            String currentToken = tokens[j];

Also available in: Unified diff