Project

General

Profile

1
package eu.dnetlib.data.mapreduce.dedup;
2

    
3
import java.util.LinkedList;
4
import java.util.List;
5
import java.util.Map;
6
import java.util.PriorityQueue;
7
import java.util.Queue;
8
import java.util.UUID;
9

    
10
import eu.dnetlib.data.mapreduce.hbase.dedup.kv.DNGFKey;
11
import eu.dnetlib.data.mapreduce.hbase.dli.kv.DliKey;
12
import org.apache.commons.io.IOUtils;
13
import org.junit.Assert;
14
import org.junit.Before;
15
import org.junit.Test;
16
import org.springframework.core.io.ClassPathResource;
17

    
18
import com.google.common.collect.Lists;
19
import com.google.common.collect.Maps;
20

    
21
import eu.dnetlib.pace.clustering.NGramUtils;
22
import eu.dnetlib.pace.config.Type;
23
import eu.dnetlib.pace.model.Field;
24
import eu.dnetlib.pace.model.FieldListImpl;
25
import eu.dnetlib.pace.model.FieldValueImpl;
26
import eu.dnetlib.pace.model.MapDocument;
27
import eu.dnetlib.pace.model.MapDocumentComparator;
28

    
29
public class TitleOrderingTest {
30

    
31
	private List<MapDocument> results = Lists.newArrayList();
32

    
33
	@Before
34
	public void setUp() throws Exception {
35

    
36
		final List<String> lines = IOUtils.readLines(new ClassPathResource("eu/dnetlib/data/mapreduce/dedup/titles.txt").getInputStream());
37
		for (final String title : lines) {
38
			final Map<String, Field> fieldMap = Maps.newHashMap();
39
			final FieldListImpl list = new FieldListImpl();
40
			list.add(new FieldValueImpl(Type.String, "title", title));
41
			fieldMap.put("title", list);
42
			results.add(new MapDocument("id-" + UUID.randomUUID(), fieldMap));
43
		}
44
	}
45

    
46
	@Test
47
	public void test() {
48

    
49
		final Queue<MapDocument> queue = new PriorityQueue<MapDocument>(100, new MapDocumentComparator("title"));
50

    
51
		queue.addAll(results);
52

    
53
		final Queue<MapDocument> queue2 = simplifyQueue(queue);
54

    
55
		while (!queue2.isEmpty()) {
56
			final MapDocument doc = queue2.remove();
57
			System.out.println(doc.values("title").stringValue());
58
		}
59
	}
60

    
61
	private Queue<MapDocument> simplifyQueue(final Queue<MapDocument> queue) {
62
		final Queue<MapDocument> q = new LinkedList<MapDocument>();
63

    
64
		String fieldRef = "";
65
		final List<MapDocument> tempResults = Lists.newArrayList();
66

    
67
		while (!queue.isEmpty()) {
68
			final MapDocument result = queue.remove();
69

    
70
			if (!result.values("title").isEmpty()) {
71
				final String field = NGramUtils.cleanupForOrdering(result.values("title").stringValue());
72
				if (field.equals(fieldRef)) {
73
					tempResults.add(result);
74
				} else {
75
					if (tempResults.size() < 5) {
76
						q.addAll(tempResults);
77
					} else {
78
						System.out.println("Skipped field: " + fieldRef + " - size: " + tempResults.size());
79
					}
80
					tempResults.clear();
81
					tempResults.add(result);
82
					fieldRef = field;
83
				}
84
			}
85
		}
86
		if (tempResults.size() < 5) {
87
			q.addAll(tempResults);
88
		} else {
89
			System.out.println("Skipped field: " + fieldRef + " - size: " + tempResults.size());
90
		}
91

    
92
		return q;
93
	}
94

    
95

    
96
    @Test
97
    public void compareDLIKey() {
98

    
99
        DNGFKey k1 = DNGFKey.mergesRel("a");
100
        DNGFKey k2 = DNGFKey.otherRel("a");
101

    
102
        Assert.assertEquals(-1, k1.compareTo(k2));
103
        Assert.assertEquals(0, k1.compareTo(k1));
104
        Assert.assertEquals(1, k2.compareTo(k1));
105

    
106

    
107
    }
108

    
109
}
(2-2/2)