package eu.dnetlib.data.collector.plugins.projects.gtr2;

import java.io.ByteArrayOutputStream;
import java.io.StringWriter;
import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.*;

import com.ximpleware.AutoPilot;
import com.ximpleware.VTDGen;
import com.ximpleware.VTDNav;
import eu.dnetlib.data.collector.plugins.HttpConnector;
import org.apache.commons.lang3.*;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

public class Gtr2Helper {
|
16
|
|
17
|
private static final Log log = LogFactory.getLog(Gtr2Helper.class); // NOPMD by marko on 11/24/08 5:02 PM
|
18
|
|
19
|
private VTDNav mainVTDNav;
|
20
|
private AutoPilot mainAutoPilot;
|
21
|
private StringWriter writer;
|
22
|
private HttpConnector connector;
|
23
|
//private BlockingQueue<String> fragment = new ArrayBlockingQueue<String>(20);
|
24
|
|
25
|
public String processProject(final VTDNav vn, final String namespaces) throws Exception {
|
26
|
//log.debug("Processing project at "+projectURL);
|
27
|
writer = new StringWriter();
|
28
|
mainVTDNav = vn;
|
29
|
mainAutoPilot = new AutoPilot(mainVTDNav);
|
30
|
writer.write("<doc " + namespaces + ">");
|
31
|
writeFragment(mainVTDNav);
|
32
|
|
33
|
mainAutoPilot.selectXPath("//link[@rel='FUND']");
|
34
|
ExecutorService es = Executors.newFixedThreadPool(5);
|
35
|
|
36
|
while (mainAutoPilot.evalXPath() != -1) {
|
37
|
Thread t = new Thread(new ProcessFunder(mainVTDNav.toNormalizedString(mainVTDNav.getAttrVal("href"))));
|
38
|
es.execute(t);
|
39
|
}
|
40
|
|
41
|
mainAutoPilot.resetXPath();
|
42
|
mainAutoPilot.selectXPath(".//link[@rel='LEAD_ORG']");
|
43
|
while (mainAutoPilot.evalXPath() != -1) {
|
44
|
Thread t = new Thread(new Org(mainVTDNav.toNormalizedString(mainVTDNav.getAttrVal("href")),
|
45
|
new String[] { "<ld-org>", "</ld-org>" }));
|
46
|
es.execute(t);
|
47
|
}
|
48
|
mainAutoPilot.resetXPath();
|
49
|
mainAutoPilot.selectXPath(".//link[@rel='PP_ORG']");
|
50
|
while (mainAutoPilot.evalXPath() != -1) {
|
51
|
Thread t = new Thread(new Org(mainVTDNav.toNormalizedString(mainVTDNav.getAttrVal("href")),
|
52
|
new String[] { "<pp-org>","</pp-org>" }));
|
53
|
es.execute(t);
|
54
|
}
|
55
|
mainAutoPilot.resetXPath();
|
56
|
|
57
|
mainAutoPilot.selectXPath(".//link[@rel='PI_PER']");
|
58
|
while (mainAutoPilot.evalXPath() != -1) {
|
59
|
Thread t = new Thread(new PiPer(mainVTDNav.toNormalizedString(mainVTDNav.getAttrVal("href"))));
|
60
|
es.execute(t);
|
61
|
}
|
62
|
es.shutdown();
|
63
|
log.debug("Waiting threads");
|
64
|
es.awaitTermination(10, TimeUnit.MINUTES);
|
65
|
|
66
|
log.debug("Finished writing project");
|
67
|
writer.write("</doc>");
|
68
|
writer.close();
|
69
|
|
70
|
return writer.toString();
|
71
|
}
|
72
|
|
73
|
private VTDNav setNavigator(final String httpUrl) {
|
74
|
VTDGen vg_tmp = new VTDGen();
|
75
|
connector = new HttpConnector();
|
76
|
try {
|
77
|
byte[] bytes = connector.getInputSource(httpUrl).getBytes("UTF-8");
|
78
|
vg_tmp.setDoc(bytes);
|
79
|
vg_tmp.parse(false);
|
80
|
//vg_tmp.parseHttpUrl(httpUrl, false);
|
81
|
return vg_tmp.getNav();
|
82
|
}catch (Throwable e){
|
83
|
return null;
|
84
|
}
|
85
|
}
|
86
|
|
87
|
private int evalXpath(final VTDNav fragmentVTDNav, final String xPath) throws Exception {
|
88
|
|
89
|
AutoPilot ap_tmp = new AutoPilot(fragmentVTDNav);
|
90
|
ap_tmp.selectXPath(xPath);
|
91
|
return ap_tmp.evalXPath();
|
92
|
}
|
93
|
|
94
|
private void writeFragment(final VTDNav nav) throws Exception {
|
95
|
ByteArrayOutputStream b = new ByteArrayOutputStream();
|
96
|
nav.dumpFragment(b);
|
97
|
String ret = b.toString();
|
98
|
b.reset();
|
99
|
writer.write(ret);
|
100
|
}
|
101
|
|
102
|
private void writeNewTagAndInfo(final VTDNav vn, final String xPath, final String xmlOpenTag, final String xmlCloseTag, final String attrName) throws Exception {
|
103
|
|
104
|
int nav_res = evalXpath(vn, xPath);
|
105
|
if (nav_res != -1) {
|
106
|
String tmp = xmlOpenTag;
|
107
|
if (attrName != null) tmp += (vn.toNormalizedString(vn.getAttrVal(attrName)));
|
108
|
else
|
109
|
tmp += (StringEscapeUtils.escapeXml11(vn.toNormalizedString(vn.getText())));
|
110
|
tmp += (xmlCloseTag);
|
111
|
writer.write(tmp);
|
112
|
}
|
113
|
}
|
114
|
|
115
|
private class PiPer implements Runnable {
|
116
|
|
117
|
private VTDNav vn;
|
118
|
|
119
|
public PiPer(String httpURL) {
|
120
|
vn = setNavigator(httpURL);
|
121
|
}
|
122
|
|
123
|
@Override
|
124
|
public void run() {
|
125
|
try {
|
126
|
writeFragment(vn);
|
127
|
} catch (Throwable e) {log.debug("Eccezione in PiPer " + e.getMessage());}
|
128
|
|
129
|
}
|
130
|
}
|
131
|
|
132
|
private class Org implements Runnable {
|
133
|
|
134
|
private String[] tags;
|
135
|
private VTDNav vn;
|
136
|
|
137
|
public Org(final String httpURL, final String[] tags) {
|
138
|
vn = setNavigator(httpURL);
|
139
|
this.tags = tags;
|
140
|
}
|
141
|
|
142
|
@Override
|
143
|
public void run() {
|
144
|
try {
|
145
|
writeNewTagAndInfo(vn, "//name", tags[0]+"<name>", "</name>", null);
|
146
|
vn.toElement(VTDNav.ROOT);
|
147
|
writeNewTagAndInfo(vn, "//country", "<country>", "</country>", null);
|
148
|
vn.toElement(VTDNav.ROOT);
|
149
|
writeNewTagAndInfo(vn, ".", "<id>", "</id>"+tags[1], "id");
|
150
|
} catch (Throwable e) {
|
151
|
log.debug("Eccezione in Org " + e.getMessage());
|
152
|
}
|
153
|
}
|
154
|
|
155
|
}
|
156
|
|
157
|
private class ProcessFunder implements Runnable {
|
158
|
|
159
|
private VTDNav vn;
|
160
|
|
161
|
public ProcessFunder(final String httpURL) {
|
162
|
vn = setNavigator(httpURL);
|
163
|
}
|
164
|
|
165
|
@Override
|
166
|
public void run() {
|
167
|
|
168
|
try {
|
169
|
AutoPilot ap = new AutoPilot(vn);
|
170
|
writeFragment(vn);
|
171
|
ap.selectXPath(".//link[@rel='FUNDER']");
|
172
|
VTDNav tmp_vn;
|
173
|
while (ap.evalXPath() != -1) {
|
174
|
tmp_vn = setNavigator(vn.toNormalizedString(vn.getAttrVal("href")));
|
175
|
writeNewTagAndInfo(tmp_vn, "//name", "<funder> <name>", "</name></funder>", null);
|
176
|
}
|
177
|
} catch (Throwable e) {log.debug("Eccezione in Funder" + e.getMessage());}
|
178
|
}
|
179
|
|
180
|
}
|
181
|
}
|