Revision 61369
Added by Sandro La Bruzzo almost 3 years ago
es_connector.py | ||
---|---|---|
10 | 10 |
import os |
11 | 11 |
from os import path |
12 | 12 |
|
13 |
|
|
14 | 13 |
log = logging.getLogger("scholexplorer-portal") |
15 | 14 |
|
16 | 15 |
pid_resolver = { |
... | ... | |
43 | 42 |
def resolveIdentifier(pid, pid_type): |
44 | 43 |
if pid_type != None: |
45 | 44 |
regex = r"\b(10[.][0-9]{4,}(?:[.][0-9]+)*/(?:(?![\"&\'<>])\S)+)\b" |
46 |
if re.match(regex,pid): |
|
45 |
if re.match(regex, pid):
|
|
47 | 46 |
log.debug("It should be doi") |
48 | 47 |
pid_type = 'doi' |
49 | 48 |
if pid_type.lower() in pid_resolver: |
... | ... | |
52 | 51 |
if pid_type.lower() == 'openaire': |
53 | 52 |
return "https://www.openaire.eu/search/publication?articleId=%s" % pid.replace('oai:dnet:', '') |
54 | 53 |
elif pid_type.lower() == 'url': |
55 |
return pid
|
|
54 |
return pid |
|
56 | 55 |
else: |
57 | 56 |
return "http://identifiers.org/%s:%s" % (pid_type, pid) |
58 | 57 |
return "" |
59 | 58 |
|
60 | 59 |
|
61 |
|
|
62 |
|
|
63 |
|
|
64 | 60 |
def create_typology_filter(value):
    """Build an Elasticsearch ``match`` query on the ``typology`` field.

    :param value: typology value to match (e.g. ``'dataset'`` or ``'publication'``)
    :return: an elasticsearch_dsl ``Q`` match query object
    """
    typology_query = Q('match', typology=value)
    return typology_query
66 | 62 |
|
... | ... | |
106 | 102 |
self.index_name = props['api.index'] |
107 | 103 |
|
108 | 104 |
def get_main_page_stats(self):
    """Collect the summary counts shown on the portal's main page.

    Returns a dict with:
      * ``total`` — count of documents in the ``*_scholix`` index divided
        by two (presumably each link is indexed in both directions —
        TODO confirm against the indexing pipeline);
      * one entry per typology (``dataset``, ``publication``) with the
        count of matching documents in the ``*_object`` index.
    """
    scholix_index = self.index_name + "_scholix"
    object_index = self.index_name + "_object"

    half_link_count = int(Search(using=self.client, index=scholix_index).count() / 2)
    stats = dict(total=half_link_count)

    for typology in ['dataset', 'publication']:
        typology_search = Search(using=self.client, index=object_index)
        typology_search = typology_search.query(Q('match', typology=typology))
        stats[typology] = typology_search.count()

    return stats
114 | 110 |
|
115 | 111 |
def query_by_id(self, id): |
116 |
s = Search(using=self.client, index=self.index_name+"_object")
|
|
112 |
s = Search(using=self.client, index=self.index_name + "_object")
|
|
117 | 113 |
s = s.query(create_pid_query(id)) |
118 | 114 |
s.aggs.bucket('typologies', 'terms', field='typology') |
119 | 115 |
s.aggs.bucket('all_datasources', 'nested', path='datasources').bucket('all_names', 'terms', |
... | ... | |
162 | 158 |
publishers=publishers), hits=hits) |
163 | 159 |
|
164 | 160 |
def simple_query(self, textual_query, start=None, end=None, user_filter=None): |
165 |
s = Search(using=self.client, index=self.index_name+"_object")
|
|
166 |
if not textual_query == '*':
|
|
161 |
s = Search(using=self.client, index=self.index_name + "_object")
|
|
162 |
if not textual_query == '*': |
|
167 | 163 |
q = Q("multi_match", query=textual_query, fields=['title', 'abstract']) |
168 | 164 |
else: |
169 | 165 |
q = Q() |
... | ... | |
200 | 196 |
s = s[start:end] |
201 | 197 |
response = s.execute() |
202 | 198 |
|
203 |
|
|
204 |
|
|
205 | 199 |
hits = [] |
206 | 200 |
|
201 |
print(f"index : {self.index_name}_object") |
|
202 |
print(response.hits.total) |
|
203 |
|
|
207 | 204 |
for index_result in response.hits: |
208 |
input_source = index_result.__dict__['_d_']
|
|
209 |
fixed_titles = []
|
|
210 |
for ids in input_source.get('localIdentifier', []): |
|
211 |
ds = resolveIdentifier(ids['id'], ids['type']) |
|
212 |
ids['url'] = ds |
|
205 |
input_source = index_result.__dict__['_d_'] |
|
206 |
fixed_titles = [] |
|
207 |
# for ids in input_source.get('localIdentifier', []):
|
|
208 |
# ds = resolveIdentifier(ids['id'], ids['type'])
|
|
209 |
# ids['url'] = ds
|
|
213 | 210 |
|
214 |
if input_source.get('title', []) is not None:
|
|
211 |
if input_source.get('title', []) is not None: |
|
215 | 212 |
for t in input_source.get('title', []): |
216 | 213 |
if len(t) > 0 and t[0] == '"' and t[-1] == '"': |
217 | 214 |
fixed_titles.append(t[1:-1]) |
218 | 215 |
else: |
219 | 216 |
fixed_titles.append(t) |
220 | 217 |
else: |
221 |
fixed_titles.append("title not available")
|
|
218 |
fixed_titles.append("title not available") |
|
222 | 219 |
input_source['title'] = fixed_titles |
223 | 220 |
hits.append(input_source) |
224 |
|
|
221 |
|
|
225 | 222 |
pid_types = [] |
226 | 223 |
for tag in response.aggs.all_pids.all_types.buckets: |
227 | 224 |
pid_types.append(dict(key=tag.key, count=tag.doc_count)) |
... | ... | |
248 | 245 |
query_type = Q('nested', path='target', query=Q('bool', must=[Q('match', **args)])) |
249 | 246 |
args_id = {'source.dnetIdentifier': object_id} |
250 | 247 |
query_for_id = Q('nested', path='source', query=Q('bool', must=[Q('match', **args_id)])) |
251 |
s = Search(using=self.client).index(self.index_name+"_scholix").query(query_for_id & query_type)
|
|
248 |
s = Search(using=self.client).index(self.index_name + "_scholix").query(query_for_id & query_type)
|
|
252 | 249 |
if start: |
253 | 250 |
s = s[start:start + 10] |
254 |
|
|
251 |
|
|
255 | 252 |
response = s.execute() |
256 | 253 |
hits = [] |
257 | 254 |
for index_hit in response.hits: |
... | ... | |
261 | 258 |
for item in current_item['target']['identifier']: |
262 | 259 |
c_it = item |
263 | 260 |
c_it['url'] = resolveIdentifier(item['identifier'], item['schema']) |
264 |
ids .append(c_it)
|
|
261 |
ids.append(c_it) |
|
265 | 262 |
current_item['target']['identifier'] = ids |
266 | 263 |
hits.append(current_item) |
267 | 264 |
|
... | ... | |
271 | 268 |
if relation is None: |
272 | 269 |
return |
273 | 270 |
relSource = relation.get('source') |
274 |
collectedFrom = relSource.get('collectedFrom',[]) |
|
271 |
collectedFrom = relSource.get('collectedFrom', [])
|
|
275 | 272 |
if collectedFrom is not None: |
276 | 273 |
for coll in collectedFrom: |
277 | 274 |
for d in source['datasources']: |
... | ... | |
281 | 278 |
|
282 | 279 |
def item_by_id(self, id, type=None, start=None): |
283 | 280 |
try: |
284 |
res = self.client.get(index=self.index_name+"_object",doc_type="_all", id=id, _source=True)
|
|
281 |
res = self.client.get(index=self.index_name + "_object", doc_type="_all", id=id, _source=True)
|
|
285 | 282 |
hits = [] |
286 | 283 |
input_source = res['_source'] |
287 | 284 |
fixed_titles = [] |
288 |
for t in input_source.get('title',[]): |
|
285 |
for t in input_source.get('title', []):
|
|
289 | 286 |
if len(t) > 0 and t[0] == '"' and t[-1] == '"': |
290 | 287 |
fixed_titles.append(t[1:-1]) |
291 | 288 |
else: |
292 | 289 |
fixed_titles.append(t) |
293 | 290 |
input_source['title'] = fixed_titles |
294 | 291 |
|
295 |
for ids in input_source.get('localIdentifier', []): |
|
296 |
ds = resolveIdentifier(ids['id'], ids['type']) |
|
297 |
ids['url'] = ds |
|
298 | 292 |
related_publications = [] |
299 | 293 |
related_dataset = [] |
300 |
related_unknown = []
|
|
294 |
related_unknown = [] |
|
301 | 295 |
|
302 | 296 |
rel_source = None |
303 | 297 |
if input_source.get('relatedPublications') > 0: |
... | ... | |
310 | 304 |
else: |
311 | 305 |
rel_source = {} |
312 | 306 |
|
313 |
|
|
314 |
|
|
315 | 307 |
if input_source.get('relatedDatasets') > 0: |
316 | 308 |
if 'dataset' == type: |
317 | 309 |
related_dataset = self.related_type(id, 'dataset', start) |
... | ... | |
332 | 324 |
related_unknown=related_unknown)) |
333 | 325 |
|
334 | 326 |
return DLIESResponse(total=1, hits=hits) |
335 |
except Exception as e:
|
|
327 |
except Exception as e: |
|
336 | 328 |
log.error("Error on getting item ") |
337 |
log.error(e)
|
|
329 |
log.error(e) |
|
338 | 330 |
log.error("on line %i" % sys.exc_info) |
339 | 331 |
return DLIESResponse() |
Also available in: Unified diff
Fixed ScholExplorer to use the new data model.