Project

General

Profile

« Previous | Next » 

Revision 42382

dedup experiments

View differences:

modules/dnet-mapreduce-jobs/trunk/src/main/java/eu/dnetlib/data/mapreduce/hbase/dedup/experiment/SubjectsMap.java
1
package eu.dnetlib.data.mapreduce.hbase.dedup.experiment;
2

  
3
import java.util.HashMap;
4
import java.util.Map.Entry;
5

  
6
import org.apache.commons.logging.Log;
7
import org.apache.commons.logging.LogFactory;
8
import org.bson.BsonDocument;
9
import org.bson.BsonDocumentWrapper;
10
import org.bson.codecs.configuration.CodecRegistry;
11
import org.bson.conversions.Bson;
12

  
13
/**
14
 * Created by claudio on 07/03/16.
15
 */
16
public class SubjectsMap extends HashMap<String, Subjects> {
17

  
18
	private static final Log log = LogFactory.getLog(SubjectsMap.class);
19

  
20
	public SubjectsMap mergeFrom(SubjectsMap sm) {
21

  
22
		if (sm != null) {
23
			for (Entry<String, Subjects> e : sm.entrySet()) {
24
				if (!this.containsKey(e.getKey())) {
25
					Subjects sub = new Subjects();
26

  
27
					sub.addAll(e.getValue());
28

  
29
					this.put(e.getKey(), sub);
30
				} else {
31
					for (String s : e.getValue()) {
32
						final Subjects subjects = this.get(e.getKey());
33
						subjects.add(s);
34
					}
35
				}
36
			}
37
		}
38

  
39
		return this;
40
	}
41

  
42
}
modules/dnet-mapreduce-jobs/trunk/src/main/java/eu/dnetlib/data/mapreduce/hbase/dedup/experiment/PublicationAnalysisMapper.java
1
package eu.dnetlib.data.mapreduce.hbase.dedup.experiment;
2

  
3
import java.io.IOException;
4
import java.util.List;
5

  
6
import eu.dnetlib.data.mapreduce.util.DedupUtils;
7
import eu.dnetlib.data.mapreduce.util.OafDecoder;
8
import eu.dnetlib.data.proto.FieldTypeProtos.StringField;
9
import eu.dnetlib.data.proto.ResultProtos;
10
import org.apache.commons.lang.StringUtils;
11
import org.apache.hadoop.hbase.client.Result;
12
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
13
import org.apache.hadoop.hbase.mapreduce.TableMapper;
14
import org.apache.hadoop.io.NullWritable;
15

  
16
/**
17
 * Created by claudio on 22/04/16.
18
 */
19
public class PublicationAnalysisMapper extends TableMapper<NullWritable, NullWritable> {
20

  
21
	public static final String RESULT = "result";
22
	private static final int MAX_DESCRIPTIONS = 50;
23

  
24
	@Override
25
	protected void setup(final Context context) throws IOException, InterruptedException {
26
		super.setup(context);
27
	}
28

  
29
	@Override
30
	protected void map(final ImmutableBytesWritable key, final Result value, final Context context) throws IOException, InterruptedException {
31

  
32
		if (new String(key.copyBytes()).contains("dedup_wf")) {
33
			context.getCounter(RESULT, "roots").increment(1);
34
			return;
35
		}
36

  
37
		final byte[] body = value.getValue(RESULT.getBytes(), DedupUtils.BODY_B);
38
		if (body == null) {
39
			context.getCounter(RESULT, "missing body").increment(1);
40
			return;
41
		}
42
		final OafDecoder decoder = OafDecoder.decode(body);
43
		final ResultProtos.Result result = decoder.getEntity().getResult();
44
		if (result.getMetadata().getResulttype().getClassid().equals("dataset")) {
45
			context.getCounter(RESULT, "dataset").increment(1);
46
			return;
47
		} else {
48
			context.getCounter(RESULT, "publication").increment(1);
49
		}
50

  
51
		if (result.getMetadata().getDescriptionCount() > MAX_DESCRIPTIONS) {
52
			context.getCounter(RESULT, "abstracts > " + MAX_DESCRIPTIONS).increment(1);
53
		} else {
54
			context.getCounter(RESULT, "abstracts: " + result.getMetadata().getDescriptionCount()).increment(1);
55
		}
56

  
57
		final List<StringField> descList = result.getMetadata().getDescriptionList();
58

  
59
		boolean empty = true;
60
		for(StringField desc : descList) {
61
			empty = empty && StringUtils.isBlank(desc.getValue());
62
		}
63

  
64
		context.getCounter(RESULT, "empty abstract: " + empty).increment(1);
65
	}
66

  
67
	@Override
68
	protected void cleanup(final Context context) throws IOException, InterruptedException {
69
		super.cleanup(context);
70
	}
71
}
modules/dnet-mapreduce-jobs/trunk/src/main/java/eu/dnetlib/data/mapreduce/hbase/dedup/experiment/Subjects.java
1
package eu.dnetlib.data.mapreduce.hbase.dedup.experiment;
2

  
3
import java.util.HashSet;
4

  
5
import org.bson.BsonDocument;
6
import org.bson.BsonDocumentWrapper;
7
import org.bson.codecs.configuration.CodecRegistry;
8
import org.bson.conversions.Bson;
9

  
10
/**
11
 * Created by claudio on 07/03/16.
12
 */
13
/**
 * A set of subject terms of a single type; see {@code SubjectsMap}.
 *
 * Created by claudio on 07/03/16.
 */
public class Subjects extends HashSet<String> {

	// serializable subclass of HashSet: pin the serial form explicitly
	private static final long serialVersionUID = 1L;

}
modules/dnet-mapreduce-jobs/trunk/src/main/java/eu/dnetlib/data/mapreduce/hbase/dedup/experiment/JoinPersonGroupMapper.java
1
package eu.dnetlib.data.mapreduce.hbase.dedup.experiment;
2

  
3
import java.io.IOException;
4
import java.io.StringReader;
5
import java.util.List;
6
import java.util.Set;
7

  
8
import com.google.common.base.Splitter;
9
import com.google.common.collect.Iterables;
10
import com.google.common.collect.Lists;
11
import com.google.common.collect.Sets;
12
import eu.dnetlib.data.mapreduce.JobParams;
13
import eu.dnetlib.data.mapreduce.util.DedupUtils;
14
import eu.dnetlib.data.mapreduce.util.OafDecoder;
15
import eu.dnetlib.data.proto.PersonProtos;
16
import eu.dnetlib.pace.config.DedupConfig;
17
import eu.dnetlib.pace.model.Person;
18
import org.apache.commons.lang.StringUtils;
19
import org.apache.commons.lang.math.RandomUtils;
20
import org.apache.hadoop.hbase.client.Put;
21
import org.apache.hadoop.hbase.client.Result;
22
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
23
import org.apache.hadoop.hbase.mapreduce.TableMapper;
24
import org.apache.hadoop.io.Text;
25
import org.apache.hadoop.mapreduce.Mapper;
26
import org.dom4j.Document;
27
import org.dom4j.Element;
28
import org.dom4j.io.SAXReader;
29

  
30
public class JoinPersonGroupMapper extends Mapper<Text, Text, Text, Text> {
31

  
32
	public static final String PERSON = "person";
33

  
34
	private static final int MAX_TOKENS = 5;
35
	private static final int MIN_FEATURES = 10;
36

  
37
	private Text outKey;
38
	private Text outValue;
39

  
40
	private SubjectParser sp;
41

  
42
	@Override
43
	protected void setup(final Context context) throws IOException, InterruptedException {
44
		outKey = new Text();
45
		outValue = new Text();
46

  
47
		sp = new SubjectParser();
48
	}
49

  
50
	@Override
51
	protected void map(final Text key, final Text value, final Context context) throws IOException, InterruptedException {
52
		// System.out.println("got key: " + new String(keyIn.copyBytes()));
53

  
54
		final SAXReader r = new SAXReader();
55
		try {
56
			final Document doc = r.read(new StringReader(value.toString()));
57
			final SubjectsMap sm = sp.parse(doc);
58

  
59
			final CsvEntry entry = new CsvEntry();
60
			for(Subjects s : sm.values()) {
61
				for(String subject : s) {
62
					entry.addFeature(subject);
63
				}
64
			}
65

  
66
			final List creatorNodes = doc.selectNodes("//*[local-name() = 'creator']");
67
			final List<Person> authors = Lists.newArrayList();
68

  
69
			for(int i = 0; i<creatorNodes.size(); i++) {
70
				final Element e = (Element) creatorNodes.get(i);
71
				authors.add(new Person(e.getText(), false));
72
			}
73

  
74
			for(Person p1 : authors) {
75

  
76
				context.getCounter(PERSON, "accurate " + p1.isAccurate()).increment(1);
77
				final Set<String> hashes = getOutKeys(p1);
78
				context.getCounter(PERSON, String.format("accurate %s keys", p1.isAccurate())).increment(hashes.size());
79
				for(String s1 : hashes) {
80
					//final String s1 = normalize(p1);
81
					final CsvEntry c = new CsvEntry(s1, entry.getFeatures());
82
					for (Person p2 : authors) {
83
						final String s2 = normalize(p2.getSurnameString());
84
						if (p1.isAccurate() && p2.isAccurate()) {
85
							if (!p1.getSurnameString().equalsIgnoreCase(p2.getSurnameString())) {
86
								c.addFeature(s2);
87
							}
88
						}
89
					}
90

  
91
					c.getFeatures().remove(s1);
92

  
93
					if (s1.length() <= 3) {
94
						context.getCounter(PERSON, "key size <= 3").increment(1);
95
						return;
96
					}
97

  
98
					if(c.getFeatures().size() < MIN_FEATURES) {
99
						context.getCounter(PERSON, "features < " + MIN_FEATURES).increment(1);
100
						return;
101
					}
102

  
103
					outKey.set(s1);
104
					outValue.set(c.toString());
105

  
106
					context.write(outKey, outValue);
107
				}
108
			}
109

  
110
		} catch (final Throwable e) {
111
			System.out.println("GOT EX " + e);
112
			e.printStackTrace(System.err);
113
			context.getCounter(PERSON, e.getClass().toString()).increment(1);
114
		}
115

  
116
	}
117

  
118
	private Set<String> getOutKeys(final Person p1) {
119
		final Set<String> hashes = Sets.newHashSet();
120
		if (p1.isAccurate()) {
121
			hashes.add(normalize(p1));
122
		} else {
123
			final String s = normalize(p1.getOriginal());
124
			for (final String token1 : tokens(s)) {
125
				for (final String token2 : tokens(s)) {
126
					if (!token1.equals(token2)) {
127
						hashes.add(firstLC(token1) + token2);
128
					}
129
				}
130
			}
131
		}
132
		return hashes;
133
	}
134

  
135
	private String normalize(final Person p) {
136

  
137
		final String s = p.getSurnameString() + firstLC(p.getNameString());
138
		return normalize(s);
139
	}
140

  
141
	private String normalize(final String s) {
142
		return s.replaceAll("[^a-zA-Z ]", "").toLowerCase().trim();
143
	}
144

  
145
	private Iterable<String> tokens(final String s) {
146
		return Iterables.limit(Splitter.on(" ").omitEmptyStrings().trimResults().split(s), MAX_TOKENS);
147
	}
148

  
149
	private String firstLC(final String s) {
150
		return StringUtils.substring(s, 0, 1).toLowerCase();
151
	}
152

  
153
}
0 154

  
modules/dnet-mapreduce-jobs/trunk/src/main/java/eu/dnetlib/data/mapreduce/hbase/dedup/experiment/SubjectParser.java
1
package eu.dnetlib.data.mapreduce.hbase.dedup.experiment;
2

  
3
import java.util.List;
4

  
5
import com.google.common.base.Splitter;
6
import org.apache.commons.lang.StringUtils;
7
import org.dom4j.Element;
8

  
9
/**
10
 * Created by claudio on 25/03/16.
11
 */
12
public class SubjectParser {
13

  
14
	public static final String REGEX_SUBJECT = "^(info:eu-repo)\\/(classification)\\/([a-zA-Z]*)\\/(.*)$";
15
	private static final int MIN_LENGTH = 5;
16

  
17
	public SubjectsMap parse(final org.dom4j.Document doc) {
18

  
19
		final List subjectNodes = doc.selectNodes("//*[local-name() = 'subject']");
20
		final SubjectsMap subjectMap = new SubjectsMap();
21

  
22
		for(int i = 0; i<subjectNodes.size(); i++) {
23
			final Element e = (Element) subjectNodes.get(i);
24
			final String subject = e.getText();
25

  
26
			final String type = guessType(subject);
27
			if (!subjectMap.containsKey(type)) {
28
				subjectMap.put(type, new Subjects());
29
			}
30

  
31
			if (StringUtils.isNotBlank(type)) {
32
				if ("keyword".equals(type)) {
33
					final Splitter splitter = Splitter.on(",").trimResults().omitEmptyStrings();
34
					for (String token : splitter.split(subject)) {
35
						final String value = token.replaceAll("[^a-zA-Z ]", "").toLowerCase();
36
						if (value.length() >= MIN_LENGTH) {
37
							subjectMap.get(type).add(value);
38
						}
39
					}
40
				} else {
41
					String token = subject.replaceFirst(REGEX_SUBJECT, "$4");
42

  
43
					if (StringUtils.isNotBlank(token)) {
44
						final String value = token.replaceAll("[^a-zA-Z ]", "").toLowerCase();
45
						if (value.length() >= MIN_LENGTH) {
46
							subjectMap.get(type).add(value);
47
						}
48
					}
49
				}
50
			}
51
		}
52

  
53
		return subjectMap;
54
	}
55

  
56
	private String guessType(final String subject) {
57
		if (subject.startsWith("info:eu-repo")) {
58
			final String s = subject.replaceAll(REGEX_SUBJECT, "$3");
59
			return s;
60
		} else {
61
			return "keyword";
62
		}
63
	}
64
}
modules/dnet-mapreduce-jobs/trunk/src/main/java/eu/dnetlib/data/mapreduce/hbase/dedup/experiment/JoinPersonGroupReducer.java
2 2

  
3 3
import java.io.IOException;
4 4

  
5
import java.util.List;
6

  
5 7
import com.google.common.base.Function;
6 8
import com.google.common.collect.Iterables;
7
import eu.dnetlib.data.mapreduce.util.OafDecoder;
8
import eu.dnetlib.data.proto.PersonProtos.Person;
9
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
10
import org.apache.hadoop.io.NullWritable;
9
import com.google.common.collect.Lists;
10

  
11
import org.apache.commons.lang.StringUtils;
12
import org.apache.commons.logging.Log;
13
import org.apache.commons.logging.LogFactory;
14

  
11 15
import org.apache.hadoop.io.Text;
12 16
import org.apache.hadoop.mapreduce.Reducer;
13 17

  
14
public class JoinPersonGroupReducer extends Reducer<ImmutableBytesWritable, ImmutableBytesWritable, NullWritable, NullWritable> {
18
public class JoinPersonGroupReducer extends Reducer<Text, Text, Text, Text> {
15 19

  
20
	/**
21
	 * logger.
22
	 */
23
	private static final Log log = LogFactory.getLog(JoinPersonGroupReducer.class); // NOPMD by marko on 11/24/08 5:02 PM
16 24

  
25
	private Text tKey;
26
	private Text tValue;
27

  
28
	private final static int MIN_ENTRIES_THRESHOLD = 100;
29

  
30
	private int minEntriesThreshold;
31

  
17 32
	@Override
18
	protected void reduce(final ImmutableBytesWritable key, final Iterable<ImmutableBytesWritable> values, final Context context) throws IOException, InterruptedException {
33
	protected void setup(final Context context) throws IOException, InterruptedException {
34
		super.setup(context);
35
		tKey = new Text("");
36
		tValue = new Text();
19 37

  
20
		final Iterable<OafDecoder> decoders = Iterables.transform(values, new Function<ImmutableBytesWritable, OafDecoder>() {
21
			@Override
22
			public OafDecoder apply(final ImmutableBytesWritable ibw) {
23
				return OafDecoder.decode(ibw.copyBytes());
24
			}
25
		});
38
		minEntriesThreshold = context.getConfiguration().getInt("min.entries.threshold", MIN_ENTRIES_THRESHOLD);
39
	}
26 40

  
27
		final Iterable<Person> persons = Iterables.transform(decoders, new Function<OafDecoder, Person>() {
41
	@Override
42
	protected void reduce(final Text key, final Iterable<Text> values, final Context context) throws IOException, InterruptedException {
43

  
44
		final CsvSerialiser csvSerialiser = new CsvSerialiser();
45
		String outKey = key.toString().replaceAll("[^a-zA-Z ]", "").toLowerCase();
46

  
47
		if (StringUtils.isBlank(outKey)) {
48
			context.getCounter("person", "blank key").increment(1);
49
			return;
50
		}
51

  
52
		final List<CsvEntry> entries = Lists.newArrayList(Iterables.transform(values, new Function<Text, CsvEntry>() {
53

  
28 54
			@Override
29
			public Person apply(final OafDecoder d) {
30
				return d.getEntity().getPerson();
55
			public CsvEntry apply(final Text t) {
56
				return CsvEntry.fromJson(t.toString());
31 57
			}
32
		});
58
		}));
33 59

  
34
		int i = 0;
35
		for (final Person p : persons) {
36
			i++;
60
		trackPersonInfo(entries.size(), context, "person");
37 61

  
62
		if (entries.size() < minEntriesThreshold) {
63
			return;
64
		}
38 65

  
66
		//tKey.set(outKey);
67
		tValue.set(csvSerialiser.asCSV(entries));
68
		context.write(tKey, tValue);
69

  
70
		context.getCounter("person", "csv").increment(1);
71
	}
72

  
73
	private void trackPersonInfo(final int count, final Context context, final String counterName) {
74

  
75
		if (count > 0 && count <= 10) {
76
			context.getCounter(counterName, count + "").increment(1);
77
			return;
39 78
		}
79

  
80
		if (count > 10 && count <= 20) {
81
			context.getCounter(counterName, "[10, 20)").increment(1);
82
			return;
83
		}
84

  
85
		if (count > 20 && count <= 30) {
86
			context.getCounter(counterName, "[20, 30)").increment(1);
87
			return;
88
		}
89

  
90
		if (count > 30 && count <= 40) {
91
			context.getCounter(counterName, "[30, 40)").increment(1);
92
			return;
93
		}
94

  
95
		if (count > 40 && count <= 50) {
96
			context.getCounter(counterName, "[40, 50)").increment(1);
97
			return;
98
		}
99

  
100
		if (count > 50 && count <= 70) {
101
			context.getCounter(counterName, "[50, 70)").increment(1);
102
			return;
103
		}
104

  
105
		if (count > 70 && count <= 100) {
106
			context.getCounter(counterName, "[70, 100)").increment(1);
107
			return;
108
		}
109

  
110
		if (count > 100) {
111
			context.getCounter(counterName, "[100, *)").increment(1);
112
			return;
113
		}
114

  
40 115
	}
41 116

  
117
	@Override
118
	public void cleanup(final Context context) throws IOException, InterruptedException {
119
		super.cleanup(context);
120
	}
121

  
42 122
}
modules/dnet-mapreduce-jobs/trunk/src/main/java/eu/dnetlib/data/mapreduce/hbase/dedup/experiment/CsvEntry.java
1
package eu.dnetlib.data.mapreduce.hbase.dedup.experiment;
2

  
3
import java.util.Set;
4

  
5
import com.google.common.collect.Sets;
6
import com.google.gson.Gson;
7

  
8
/**
9
 * Created by claudio on 20/04/16.
10
 */
11
public class CsvEntry {
12

  
13
	private String key;
14

  
15
	private Set<String> features = Sets.newLinkedHashSet();
16

  
17
	public CsvEntry() {
18
	}
19

  
20
	public CsvEntry(final String key, final Set<String> features) {
21
		this.key = key;
22
		this.features = features;
23
	}
24

  
25
	public CsvEntry(final Set<String> features) {
26
		this.features = features;
27
	}
28

  
29
	public void addFeature(final String f) {
30
		getFeatures().add(f);
31
	}
32

  
33
	public Set<String> getFeatures() {
34
		return features;
35
	}
36

  
37
	public void setFeatures(final Set<String> features) {
38
		this.features = features;
39
	}
40

  
41
	public static CsvEntry fromJson(final String json) {
42
		return new Gson().fromJson(json, CsvEntry.class);
43
	}
44

  
45
	public String getKey() {
46
		return key;
47
	}
48

  
49
	public void setKey(final String key) {
50
		this.key = key;
51
	}
52

  
53
	@Override
54
	public String toString() {
55
		return new Gson().toJson(this);
56
	}
57

  
58
	@Override
59
	public boolean equals(final Object o) {
60
		return (o instanceof CsvEntry) && ((CsvEntry) o).getFeatures().equals(getFeatures());
61
	}
62

  
63
}
modules/dnet-mapreduce-jobs/trunk/src/main/java/eu/dnetlib/data/mapreduce/hbase/dedup/experiment/CsvSerialiser.java
1
package eu.dnetlib.data.mapreduce.hbase.dedup.experiment;
2

  
3
import java.io.StringWriter;
4
import java.util.List;
5
import java.util.Set;
6

  
7
import com.google.common.base.Joiner;
8
import com.google.common.collect.Iterables;
9
import com.google.common.collect.Sets;
10
import org.apache.commons.lang.StringUtils;
11

  
12
/**
13
 * Created by claudio on 26/04/16.
14
 */
15
public class CsvSerialiser {
16

  
17
	private final static int MAX_FEATURES = 100;
18
	private final static int MAX_ROWS = 1000;
19

  
20
	private int maxRows = MAX_ROWS;
21
	private int maxFeatures = MAX_FEATURES;
22

  
23
	public CsvSerialiser() {
24
	}
25

  
26
	public CsvSerialiser(int maxRows, int maxFeatures) {
27
		this.maxRows = maxRows;
28
		this.maxFeatures = maxFeatures;
29
	}
30

  
31
	public String asCSV(final List<CsvEntry> list) {
32
		final Set<String> features = Sets.newLinkedHashSet();
33

  
34
		for(CsvEntry e : Iterables.limit(list, maxRows)) {
35
			features.addAll(e.getFeatures());
36
		}
37

  
38
		final Iterable<String> cappedFeatures = Iterables.limit(features, maxFeatures);
39
		//context.getCounter("person", "features " + Iterables.size(cappedFeatures)).increment(1);
40

  
41
		final StringWriter csv = new StringWriter();
42
		csv.append("k,");
43
		csv.append(Joiner.on(",").join(cappedFeatures) + "\n");
44
		for(CsvEntry e : Iterables.limit(list, maxRows)) {
45
			final StringWriter line = new StringWriter();
46
			line.append(e.getKey()+",");
47
			for(String f : cappedFeatures) {
48
				if(e.getFeatures().contains(f)) {
49
					line.append("1,");
50
				} else {
51
					line.append("0,");
52
				}
53
			}
54
			csv.append(StringUtils.substringBeforeLast(line.toString(), ",")  + "\n");
55
		}
56

  
57
		return csv.toString();
58
	}
59

  
60
}

Also available in: Unified diff