1
|
package eu.dnetlib.data.collector.plugins.datasets;
|
2
|
|
3
|
import java.io.IOException;
|
4
|
import java.io.InputStream;
|
5
|
import java.util.Iterator;
|
6
|
|
7
|
import org.apache.commons.io.IOUtils;
|
8
|
import org.apache.commons.lang3.StringEscapeUtils;
|
9
|
import org.apache.commons.logging.Log;
|
10
|
import org.apache.commons.logging.LogFactory;
|
11
|
import org.apache.http.client.methods.CloseableHttpResponse;
|
12
|
import org.apache.http.client.methods.HttpPost;
|
13
|
import org.apache.http.entity.StringEntity;
|
14
|
import org.apache.http.impl.client.CloseableHttpClient;
|
15
|
import org.apache.http.impl.client.HttpClients;
|
16
|
|
17
|
import com.google.gson.Gson;
|
18
|
import com.google.gson.GsonBuilder;
|
19
|
|
20
|
/**
|
21
|
* The Class JournalIterator.
|
22
|
*/
|
23
|
public class DatasetsIterator implements Iterable<String>, Iterator<String> {
|
24
|
|
25
|
/** The logger. */
|
26
|
private static final Log log = LogFactory.getLog(DatasetsIterator.class);
|
27
|
|
28
|
/** The base url template. */
|
29
|
private static String BASE_URL_TEMPLATE = "http://ws.pangaea.de/es/pangaea/panmd/_search?_source=xml&size=%d&from=%d";
|
30
|
|
31
|
/** The journal id. */
|
32
|
private String journalId = "";
|
33
|
|
34
|
/** The journal name. */
|
35
|
private String journalName = "";
|
36
|
|
37
|
/** The journal issn. */
|
38
|
private String journalISSN = "";
|
39
|
|
40
|
/** The openaire datasource. */
|
41
|
private String openaireDatasource = "";
|
42
|
|
43
|
/** The total. */
|
44
|
private long total;
|
45
|
|
46
|
/** The from. */
|
47
|
private int from;
|
48
|
|
49
|
/** The current iterator. */
|
50
|
private int currentIterator;
|
51
|
|
52
|
/** The current response. */
|
53
|
private ElasticSearchResponse currentResponse;
|
54
|
|
55
|
/** The request. */
|
56
|
private RequestField request;
|
57
|
|
58
|
/** The default size. */
|
59
|
private static int DEFAULT_SIZE = 10;
|
60
|
|
61
|
private String projectCordaId;
|
62
|
|
63
|
private static String RECORD_TEMPLATE = "<datasetsRecord><oaf:projectid xmlns:oaf=\"http://namespace.openaire.eu/oaf\">%s</oaf:projectid>"
|
64
|
+ "<journal name='%s' issn='%s' datasourceid = '%s'/><metadata>%s</metadata></datasetsRecord>";
|
65
|
|
66
|
/**
|
67
|
* Instantiates a new journal iterator.
|
68
|
*
|
69
|
* @param request
|
70
|
* the request
|
71
|
*/
|
72
|
public DatasetsIterator(final RequestField request, final String projectCordaId, final PangaeaJournalInfo info) {
|
73
|
this.request = request;
|
74
|
this.setProjectCordaId(projectCordaId);
|
75
|
|
76
|
if (info != null) {
|
77
|
this.setJournalId(info.getJournalId());
|
78
|
this.setJournalName(StringEscapeUtils.escapeXml(info.getJournalName()));
|
79
|
this.setJournalISSN(info.getJournalISSN());
|
80
|
this.setOpenaireDatasource(info.getDatasourceId());
|
81
|
}
|
82
|
log.debug("Start Iterator");
|
83
|
}
|
84
|
|
85
|
/**
|
86
|
* Execute query.
|
87
|
*
|
88
|
* @param from
|
89
|
* the from
|
90
|
* @param size
|
91
|
* the size
|
92
|
* @return the string
|
93
|
*/
|
94
|
private String executeQuery(final int from, final int size) {
|
95
|
log.debug("executing query " + this.request.getQuery().getTerm());
|
96
|
log.debug(String.format("from:%d size:%d", from, size));
|
97
|
CloseableHttpResponse response = null;
|
98
|
InputStream responseBody = null;
|
99
|
CloseableHttpClient httpclient = HttpClients.createDefault();
|
100
|
try {
|
101
|
|
102
|
HttpPost post = new HttpPost(String.format(BASE_URL_TEMPLATE, size, from));
|
103
|
Gson g = new GsonBuilder().disableHtmlEscaping().create();
|
104
|
StringEntity entry = new StringEntity(g.toJson(this.request));
|
105
|
post.setEntity(entry);
|
106
|
long start = System.currentTimeMillis();
|
107
|
response = httpclient.execute(post);
|
108
|
int statusCode = response.getStatusLine().getStatusCode();
|
109
|
if (statusCode == 200) {
|
110
|
responseBody = response.getEntity().getContent();
|
111
|
String s = IOUtils.toString(responseBody);
|
112
|
log.debug("Request done in " + (System.currentTimeMillis() - start) + " ms");
|
113
|
responseBody.close();
|
114
|
return s;
|
115
|
}
|
116
|
return null;
|
117
|
} catch (Exception e) {
|
118
|
log.error("Error on executing query :" + request.getQuery().getTerm(), e);
|
119
|
return null;
|
120
|
} finally {
|
121
|
try {
|
122
|
responseBody.close();
|
123
|
response.close();
|
124
|
httpclient.close();
|
125
|
} catch (IOException e) {
|
126
|
log.error("Can't close connections gracefully", e);
|
127
|
}
|
128
|
}
|
129
|
|
130
|
}
|
131
|
|
132
|
/**
|
133
|
* Gets the journal id.
|
134
|
*
|
135
|
* @return the journalId
|
136
|
*/
|
137
|
public String getJournalId() {
|
138
|
return journalId;
|
139
|
}
|
140
|
|
141
|
/**
|
142
|
* Sets the journal id.
|
143
|
*
|
144
|
* @param journalId
|
145
|
* the journalId to set
|
146
|
*/
|
147
|
public void setJournalId(final String journalId) {
|
148
|
this.journalId = journalId;
|
149
|
}
|
150
|
|
151
|
/*
|
152
|
* (non-Javadoc)
|
153
|
*
|
154
|
* @see java.util.Iterator#hasNext()
|
155
|
*/
|
156
|
@Override
|
157
|
public boolean hasNext() {
|
158
|
return (from + currentIterator) < total;
|
159
|
}
|
160
|
|
161
|
/*
|
162
|
* (non-Javadoc)
|
163
|
*
|
164
|
* @see java.util.Iterator#next()
|
165
|
*/
|
166
|
@Override
|
167
|
public String next() {
|
168
|
String xml = String.format(RECORD_TEMPLATE, this.projectCordaId, this.journalName, this.journalISSN, this.openaireDatasource, currentResponse
|
169
|
.getXmlRecords().get(currentIterator));
|
170
|
currentIterator++;
|
171
|
if (currentIterator == DEFAULT_SIZE) {
|
172
|
getNextItem();
|
173
|
}
|
174
|
return xml;
|
175
|
}
|
176
|
|
177
|
/*
|
178
|
* (non-Javadoc)
|
179
|
*
|
180
|
* @see java.util.Iterator#remove()
|
181
|
*/
|
182
|
@Override
|
183
|
public void remove() {
|
184
|
throw new UnsupportedOperationException();
|
185
|
|
186
|
}
|
187
|
|
188
|
/*
|
189
|
* (non-Javadoc)
|
190
|
*
|
191
|
* @see java.lang.Iterable#iterator()
|
192
|
*/
|
193
|
@Override
|
194
|
public Iterator<String> iterator() {
|
195
|
from = 0;
|
196
|
total = 0;
|
197
|
getNextItem();
|
198
|
return this;
|
199
|
}
|
200
|
|
201
|
/**
|
202
|
* Gets the next item.
|
203
|
*
|
204
|
* @return the next item
|
205
|
*/
|
206
|
private void getNextItem() {
|
207
|
from += currentIterator;
|
208
|
currentResponse = ElasticSearchResponse.createNewResponse(executeQuery(from, DEFAULT_SIZE));
|
209
|
total = currentResponse == null ? 0 : currentResponse.getTotal();
|
210
|
log.debug("from : " + from + " total of the request is " + total);
|
211
|
currentIterator = 0;
|
212
|
}
|
213
|
|
214
|
/**
|
215
|
* @return the projectCordaId
|
216
|
*/
|
217
|
public String getProjectCordaId() {
|
218
|
return projectCordaId;
|
219
|
}
|
220
|
|
221
|
/**
|
222
|
* @param projectCordaId
|
223
|
* the projectCordaId to set
|
224
|
*/
|
225
|
public void setProjectCordaId(final String projectCordaId) {
|
226
|
this.projectCordaId = projectCordaId;
|
227
|
}
|
228
|
|
229
|
/**
|
230
|
* @return the journalName
|
231
|
*/
|
232
|
public String getJournalName() {
|
233
|
return journalName;
|
234
|
}
|
235
|
|
236
|
/**
|
237
|
* @param journalName
|
238
|
* the journalName to set
|
239
|
*/
|
240
|
public void setJournalName(final String journalName) {
|
241
|
this.journalName = journalName;
|
242
|
}
|
243
|
|
244
|
/**
|
245
|
* @return the journalISSN
|
246
|
*/
|
247
|
public String getJournalISSN() {
|
248
|
return journalISSN;
|
249
|
}
|
250
|
|
251
|
/**
|
252
|
* @param journalISSN
|
253
|
* the journalISSN to set
|
254
|
*/
|
255
|
public void setJournalISSN(final String journalISSN) {
|
256
|
this.journalISSN = journalISSN;
|
257
|
}
|
258
|
|
259
|
/**
|
260
|
* @return the openaireDatasource
|
261
|
*/
|
262
|
public String getOpenaireDatasource() {
|
263
|
return openaireDatasource;
|
264
|
}
|
265
|
|
266
|
/**
|
267
|
* @param openaireDatasource
|
268
|
* the openaireDatasource to set
|
269
|
*/
|
270
|
public void setOpenaireDatasource(final String openaireDatasource) {
|
271
|
this.openaireDatasource = openaireDatasource;
|
272
|
}
|
273
|
|
274
|
}
|