1
|
package eu.dnetlib.iis.documentssimilarity.producer;
|
2
|
|
3
|
import java.io.IOException;
|
4
|
import java.util.HashMap;
|
5
|
import java.util.List;
|
6
|
import java.util.Map;
|
7
|
import java.util.Random;
|
8
|
|
9
|
import org.apache.hadoop.conf.Configuration;
|
10
|
import org.apache.hadoop.fs.FileSystem;
|
11
|
import org.apache.hadoop.fs.Path;
|
12
|
|
13
|
import com.google.common.collect.Lists;
|
14
|
|
15
|
import de.svenjacobs.loremipsum.LoremIpsum;
|
16
|
import eu.dnetlib.iis.core.java.PortBindings;
|
17
|
import eu.dnetlib.iis.core.java.Process;
|
18
|
import eu.dnetlib.iis.core.java.io.DataStore;
|
19
|
import eu.dnetlib.iis.core.java.io.FileSystemPath;
|
20
|
import eu.dnetlib.iis.core.java.porttype.AvroPortType;
|
21
|
import eu.dnetlib.iis.core.java.porttype.PortType;
|
22
|
import eu.dnetlib.iis.documentssimilarity.schemas.DocumentMetadata;
|
23
|
import eu.dnetlib.iis.importer.schemas.Person;
|
24
|
|
25
|
/**
|
26
|
* Produce data stores
|
27
|
*
|
28
|
* @author Mateusz Fedoryszak
|
29
|
*/
|
30
|
public class DocumentAvroDatastoreProducer implements Process {
|
31
|
|
32
|
private final static String documentPort = "document";
|
33
|
|
34
|
public Map<String, PortType> getInputPorts() {
|
35
|
return new HashMap<String, PortType>();
|
36
|
}
|
37
|
|
38
|
@Override
|
39
|
public Map<String, PortType> getOutputPorts() {
|
40
|
return createOutputPorts();
|
41
|
}
|
42
|
|
43
|
private static Map<String, PortType> createOutputPorts() {
|
44
|
HashMap<String, PortType> outputPorts =
|
45
|
new HashMap<String, PortType>();
|
46
|
outputPorts.put(documentPort,
|
47
|
new AvroPortType(DocumentMetadata.SCHEMA$));
|
48
|
return outputPorts;
|
49
|
}
|
50
|
|
51
|
public static Person createPerson(String id, String name) {
|
52
|
return new Person(id, null, null, name);
|
53
|
}
|
54
|
|
55
|
public static List<DocumentMetadata> getDocumentMetadataList() {
|
56
|
DocumentMetadata doc1 = new DocumentMetadata();
|
57
|
doc1.setId("1");
|
58
|
doc1.setTitle("A new method of something");
|
59
|
doc1.setAbstract$("We present a new method of doing something. We are not sure yet what " +
|
60
|
"it is actually doing, but it definitely is a worthwhile technique.");
|
61
|
doc1.setKeywords(Lists.<CharSequence>newArrayList("method", "something", "nothing", "anything"));
|
62
|
doc1.setAuthors(Lists.<Person>newArrayList(createPerson("1", "Jan Kowalski")));
|
63
|
|
64
|
DocumentMetadata doc2 = new DocumentMetadata();
|
65
|
doc2.setId("2");
|
66
|
doc2.setTitle("How to do it?");
|
67
|
doc2.setAbstract$("We are asking some of fundamental engineering questions here. As all " +
|
68
|
"kinds of fundamental questions, they probably have no answers.");
|
69
|
doc2.setKeywords(Lists.<CharSequence>newArrayList(
|
70
|
"doing things", "questioning", "falsificationism", "epistemology"));
|
71
|
doc2.setAuthors(Lists.<Person>newArrayList(createPerson("1", "Jan Kowalski"), createPerson("2", "Zygmunt Nowak")));
|
72
|
|
73
|
DocumentMetadata doc3 = new DocumentMetadata();
|
74
|
doc3.setId("3");
|
75
|
doc3.setTitle("Our great tool");
|
76
|
doc3.setAbstract$("In this paper we present Our great tool that is capable of doing " +
|
77
|
"anything. First theoretical studies have shown its great potential. Practical " +
|
78
|
"applications are to be investigated in the future.");
|
79
|
doc3.setKeywords(Lists.<CharSequence>newArrayList(
|
80
|
"Our great tool", "perpetuum mobile", "stop problem", "P==NP?"));
|
81
|
doc3.setAuthors(Lists.<Person>newArrayList(createPerson("2", "Zygmunt Nowak")));
|
82
|
|
83
|
DocumentMetadata doc4 = new DocumentMetadata();
|
84
|
doc4.setId("4");
|
85
|
doc4.setTitle("Big and great system");
|
86
|
doc4.setAbstract$("Worldwide amount of data is growing every year. That is why ever " +
|
87
|
"bigger and greater systems needs to be built. In this paper we present our biggest " +
|
88
|
"and greatest system so far.");
|
89
|
doc4.setKeywords(Lists.<CharSequence>newArrayList(
|
90
|
"big", "enormous", "great", "grand"));
|
91
|
doc4.setAuthors(Lists.<Person>newArrayList(createPerson("2", "Zygmunt Nowak"), createPerson("1", "Jan Kowalski")));
|
92
|
|
93
|
List<DocumentMetadata> results = Lists.newArrayList(doc1, doc2, doc3, doc4);
|
94
|
|
95
|
// adding dummy records up to 10 in total
|
96
|
LoremIpsum loremIpsum = new LoremIpsum();
|
97
|
Random rand = new Random();
|
98
|
for(int i = 5; i<=200; i++) {
|
99
|
DocumentMetadata doc = new DocumentMetadata();
|
100
|
doc.setId(Integer.toString(i));
|
101
|
doc.setTitle(loremIpsum.getWords(10, rand.nextInt(50)));
|
102
|
doc.setAbstract$(loremIpsum.getWords(50, rand.nextInt(50)));
|
103
|
results.add(doc);
|
104
|
}
|
105
|
|
106
|
return results;
|
107
|
}
|
108
|
|
109
|
@Override
|
110
|
public void run(PortBindings portBindings, Configuration conf,
|
111
|
Map<String, String> parameters) throws IOException {
|
112
|
Map<String, Path> output = portBindings.getOutput();
|
113
|
FileSystem fs = FileSystem.get(conf);
|
114
|
|
115
|
DataStore.create(getDocumentMetadataList(),
|
116
|
new FileSystemPath(fs, output.get(documentPort)));
|
117
|
}
|
118
|
|
119
|
public static void main(String[] args) {
|
120
|
Random rand = new Random();
|
121
|
LoremIpsum loremIpsum = new LoremIpsum();
|
122
|
System.out.println(loremIpsum.getWords(100));
|
123
|
|
124
|
}
|
125
|
}
|