Project

General

Profile

1 46770 sandro.lab
from json import JSONEncoder
2 49837 sandro.lab
import sys
3 50064 sandro.lab
import re
4 46770 sandro.lab
from elasticsearch import Elasticsearch
5
from elasticsearch_dsl import *
6 56023 sandro.lab
import logging
7
from eu.dnetlib.util import get_index_properties
8 58542 sandro.lab
import traceback
9 46770 sandro.lab
10 49818 sandro.lab
import os
11
from os import path
12 46770 sandro.lab
13 56023 sandro.lab
14
# Logger shared by every function and class in this module.
log = logging.getLogger("scholexplorer-portal")

# URL template used by all identifier types listed in the ``update`` call below.
_NUCEST_GENBANK_URL = "http://www.ncbi.nlm.nih.gov/nucest/%s?report=genbank"

# Maps a lower-case identifier type to a URL template holding a single ``%s``
# placeholder that receives the identifier value.
pid_resolver = {
    "pdb": "http://www.rcsb.org/pdb/explore/explore.do?structureId=%s",
    "ncbi-n": "http://www.ncbi.nlm.nih.gov/gquery/?term=%s",
    "pmid": "http://www.ncbi.nlm.nih.gov/pubmed/%s",
    "pmcid": "http://www.ncbi.nlm.nih.gov/pmc/articles/%s",
    "pubmedid": "http://www.ncbi.nlm.nih.gov/pubmed/%s",
    "doi": "http://dx.doi.org/%s",
}
# NOTE(review): every type below resolves through the same genbank report
# template -- this looks like a placeholder for several of them; confirm.
pid_resolver.update(dict.fromkeys(
    (
        "genbank", "nuccore", "swiss-prot", "arrayexpress", "biomodels", "bmrb",
        "ena", "geo", "ensembl", "mgi", "bind", "pride", "ddbj", "bioproject",
        "embl", "sra",
    ),
    _NUCEST_GENBANK_URL,
))
40 49818 sandro.lab
41
42 50035 sandro.lab
# Compiled once at import time instead of on every call. Used with ``match``,
# so it only recognises identifiers that *start* with a DOI-shaped string.
_DOI_PATTERN = re.compile(r"\b(10[.][0-9]{4,}(?:[.][0-9]+)*/(?:(?![\"&\'<>])\S)+)\b")


def resolveIdentifier(pid, pid_type):
    """Return a URL at which the identifier *pid* can be resolved.

    :param pid: the identifier value (a string), or ``None``.
    :param pid_type: the identifier type/schema name, or ``None``.
    :return: the resolved URL, or ``""`` when *pid* or *pid_type* is ``None``.

    Resolution rules, in order:

    * a *pid* that starts with a DOI-shaped string is treated as a DOI,
      whatever *pid_type* says;
    * types listed in ``pid_resolver`` (compared lower-case) use their template;
    * ``openaire`` builds an OpenAIRE search URL after removing ``oai:dnet:``
      from the identifier;
    * ``url`` returns *pid* unchanged;
    * anything else falls back to ``http://identifiers.org/<pid_type>:<pid>``
      (with *pid_type* in its original case).
    """
    # A missing identifier used to crash inside ``re.match``; there is nothing
    # to resolve in that case, so answer like the "no type" case.
    if pid is None or pid_type is None:
        return ""

    if _DOI_PATTERN.match(pid):
        log.debug("It should be doi")
        pid_type = 'doi'

    kind = pid_type.lower()
    if kind in pid_resolver:
        return pid_resolver[kind] % pid
    if kind == 'openaire':
        return "https://www.openaire.eu/search/publication?articleId=%s" % pid.replace('oai:dnet:', '')
    if kind == 'url':
        return pid
    return "http://identifiers.org/%s:%s" % (pid_type, pid)
58
59 50064 sandro.lab
60 49818 sandro.lab
61
62 56023 sandro.lab
63 46770 sandro.lab
def create_typology_filter(value):
    """Build a ``match`` query on the ``typology`` field for *value*."""
    typology_match = Q('match', typology=value)
    return typology_match
65
66
67
def create_pid_type_filter(value):
    """Build a nested query on ``localIdentifier`` matching ``localIdentifier.type`` against *value*."""
    # The field name contains a dot, so it has to be passed through ** expansion.
    type_match = Q('match', **{'localIdentifier.type': value})
    must_match = Q('bool', must=[type_match])
    return Q('nested', path='localIdentifier', query=must_match)
70
71 49818 sandro.lab
72 50064 sandro.lab
def create_pid_query(value):
    """Build a nested query on ``localIdentifier`` matching ``localIdentifier.id`` against *value*."""
    # The field name contains a dot, so it has to be passed through ** expansion.
    id_match = Q('match', **{'localIdentifier.id': value})
    must_match = Q('bool', must=[id_match])
    return Q('nested', path='localIdentifier', query=must_match)
75
76
77 47284 sandro.lab
def create_publisher_filter(value):
    """Build a ``match`` query on the ``publisher`` field for *value*."""
    publisher_match = Q('match', publisher=value)
    return publisher_match
79 46770 sandro.lab
80 47284 sandro.lab
81 46770 sandro.lab
def create_datasource_filter(value):
    """Build a nested query on ``datasources`` matching ``datasources.datasourceName`` against *value*."""
    # The field name contains a dot, so it has to be passed through ** expansion.
    name_match = Q('match', **{'datasources.datasourceName': value})
    must_match = Q('bool', must=[name_match])
    return Q('nested', path='datasources', query=must_match)
84
85
86
class DLIESResponseEncoder(JSONEncoder):
    """JSON encoder that serialises arbitrary objects through their ``__dict__``."""

    def default(self, o):
        """Return the attribute dictionary of *o*.

        Objects that have no ``__dict__`` are handed to the base class, which
        raises the ``TypeError`` the ``json`` module documents for values it
        cannot serialise (previously an ``AttributeError`` escaped instead).
        """
        try:
            return o.__dict__
        except AttributeError:
            return JSONEncoder.default(self, o)
89
90
91
class DLIESResponse(object):
    """Plain container for the outcome of an index query.

    Attributes:
        facet: dictionary of facet lists; defaults to empty ``pid``,
            ``typology`` and ``datasource`` lists.
        total: total number of results reported for the query.
        hits: list of result documents.
    """

    def __init__(self, facet=None, total=0, hits=None):
        if facet is None:
            facet = dict(pid=[], typology=[], datasource=[])
        # A fresh list per instance: a ``hits=[]`` default would be one list
        # object shared by every response created without explicit hits.
        if hits is None:
            hits = []
        self.facet = facet
        self.total = total
        self.hits = hits
98
99
100
class DLIESConnector(object):
    """Query layer over the ``<index>_object`` and ``<index>_scholix`` Elasticsearch indices.

    Connection settings are read from ``get_index_properties()``: ``es_index``
    is a comma-separated list of hosts and ``api.index`` is the index name
    prefix.
    """

    def __init__(self):
        props = get_index_properties()
        self.index_host = [x.strip() for x in props['es_index'].split(',')]
        # NOTE(review): the Elasticsearch client documents ``timeout`` in
        # seconds, so 600000 is roughly a week -- confirm milliseconds were
        # not intended.
        self.client = Elasticsearch(hosts=self.index_host, timeout=600000)
        self.index_name = props['api.index']

    # ------------------------------------------------------------------
    # private helpers
    # ------------------------------------------------------------------

    def _object_search(self):
        """Return a new ``Search`` bound to the ``<index>_object`` index."""
        return Search(using=self.client, index=self.index_name + "_object")

    @staticmethod
    def _add_facet_aggregations(search):
        """Attach the typology, datasource, publisher and pid-type aggregations to *search* in place."""
        search.aggs.bucket('typologies', 'terms', field='typology')
        search.aggs.bucket('all_datasources', 'nested', path='datasources').bucket(
            'all_names', 'terms', field='datasources.datasourceName')
        search.aggs.bucket('all_publisher', 'terms', field='publisher')
        search.aggs.bucket('all_pids', 'nested', path='localIdentifier').bucket(
            'all_types', 'terms', field='localIdentifier.type')

    @staticmethod
    def _hit_source(hit):
        """Return the raw dictionary backing a search hit."""
        # NOTE(review): ``_d_`` is an internal attribute of the hit object;
        # kept as in the original code -- consider the public ``to_dict()``.
        return hit.__dict__['_d_']

    @staticmethod
    def _strip_title_quotes(titles):
        """Return *titles* with one pair of enclosing double quotes removed from each quoted entry."""
        return [
            t[1:-1] if (len(t) > 0 and t[0] == '"' and t[-1] == '"') else t
            for t in titles or []
        ]

    @classmethod
    def _decorate_source(cls, source):
        """Add a ``url`` to each local identifier and clean the titles of *source*, in place."""
        for ids in source.get('localIdentifier') or []:
            ids['url'] = resolveIdentifier(ids['id'], ids['type'])
        # ``title`` is always (re)written, so it is an empty list when absent.
        source['title'] = cls._strip_title_quotes(source.get('title'))
        return source

    @staticmethod
    def _bucket_counts(buckets, skip_empty_keys=False):
        """Convert aggregation buckets into ``{'key': ..., 'count': ...}`` dictionaries."""
        return [
            dict(key=tag.key, count=tag.doc_count)
            for tag in buckets
            if not skip_empty_keys or len(tag.key) > 0
        ]

    @classmethod
    def _facets(cls, response):
        """Build the facet dictionary from the aggregations of *response*."""
        aggs = response.aggs
        pid_types = cls._bucket_counts(aggs.all_pids.all_types.buckets)
        datasources = cls._bucket_counts(aggs.all_datasources.all_names.buckets)
        typologies = cls._bucket_counts(aggs.typologies.buckets)
        # Publisher buckets with an empty key are not reported.
        publishers = cls._bucket_counts(aggs.all_publisher.buckets, skip_empty_keys=True)
        return dict(pid=pid_types, typology=typologies, datasource=datasources,
                    publishers=publishers)

    @staticmethod
    def _parse_user_filter(user_filter):
        """Translate a ``key_value__key_value`` filter string into a list of queries.

        Recognised keys are ``typology``, ``datasource``, ``pidtype`` and
        ``publisher``; other keys are ignored. The value is everything after
        the first underscore, so values containing ``_`` are kept whole.
        Chunks without an underscore are logged and skipped.
        """
        if not user_filter:
            return []
        # Resolved at call time: these builders are module-level functions.
        builders = {
            'typology': create_typology_filter,
            'datasource': create_datasource_filter,
            'pidtype': create_pid_type_filter,
            'publisher': create_publisher_filter,
        }
        queries = []
        for chunk in user_filter.split('__'):
            filter_key, separator, filter_value = chunk.partition('_')
            if not separator:
                log.warning("Ignoring malformed filter %r", chunk)
                continue
            builder = builders.get(filter_key)
            if builder is not None:
                queries.append(builder(filter_value))
        return queries

    # ------------------------------------------------------------------
    # public API
    # ------------------------------------------------------------------

    def get_main_page_stats(self):
        """Return counters for the main page.

        :return: dict with ``total`` (half of the document count of the
            ``_scholix`` index, truncated to an int) plus the number of
            ``dataset`` and ``publication`` objects.
        """
        scholix_count = Search(using=self.client, index=self.index_name + "_scholix").count()
        # NOTE(review): the halving presumably accounts for links being stored
        # in both directions -- confirm.
        stats = dict(total=int(scholix_count / 2))
        for item in ['dataset', 'publication']:
            s = self._object_search().query(Q('match', typology=item))
            stats[item] = s.count()
        return stats

    def query_by_id(self, id):
        """Search objects whose local identifier matches *id*.

        :param id: identifier value matched against ``localIdentifier.id``.
        :return: a ``DLIESResponse`` with the decorated hits, the facets and
            the total reported by the search response.
        """
        s = self._object_search().query(create_pid_query(id))
        self._add_facet_aggregations(s)
        response = s.execute()

        hits = [self._decorate_source(self._hit_source(hit)) for hit in response.hits]
        return DLIESResponse(total=response.hits.total, facet=self._facets(response), hits=hits)

    def simple_query(self, textual_query, start=None, end=None, user_filter=None):
        """Run a full-text search over ``title`` and ``abstract``.

        :param textual_query: text to search; ``'*'`` matches everything.
        :param start: offset of the first hit; when given without *end*, ten
            hits are returned.
        :param end: offset after the last hit (only used when *start* is given).
        :param user_filter: optional ``key_value`` filters joined by ``__``.
        :return: a ``DLIESResponse`` with the decorated hits, the facets and
            the total obtained from a separate count request.
        """
        if textual_query == '*':
            q = Q()
        else:
            q = Q("multi_match", query=textual_query, fields=['title', 'abstract'])

        s = self._object_search().query(q)
        filter_queries = self._parse_user_filter(user_filter)
        if filter_queries:
            s = s.filter(Q('bool', must=filter_queries))
        self._add_facet_aggregations(s)

        if start is not None:
            if end is None:
                end = start + 10
            s = s[start:end]
        response = s.execute()

        hits = [self._decorate_source(self._hit_source(hit)) for hit in response.hits]
        facet = self._facets(response)
        return DLIESResponse(total=s.count(), facet=facet, hits=hits)

    def related_type(self, object_id, object_type, start=None):
        """Return the scholix links from *object_id* to targets of *object_type*.

        :param object_id: value matched against ``source.dnetIdentifier``.
        :param object_type: value matched against ``target.objectType``.
        :param start: optional offset; when truthy, ten links starting there
            are requested.
        :return: list of link dictionaries; every target identifier gets a
            ``url`` entry.
        """
        query_type = Q('nested', path='target',
                       query=Q('bool', must=[Q('match', **{'target.objectType': object_type})]))
        query_for_id = Q('nested', path='source',
                         query=Q('bool', must=[Q('match', **{'source.dnetIdentifier': object_id})]))
        s = Search(using=self.client).index(self.index_name + "_scholix").query(query_for_id & query_type)
        if start:
            s = s[start:start + 10]

        response = s.execute()
        hits = []
        for index_hit in response.hits:
            current_item = self._hit_source(index_hit)
            target = current_item.get('target')
            if target:
                # Tolerate targets without identifiers instead of failing the
                # whole request on one incomplete link.
                for item in target.get('identifier') or []:
                    item['url'] = resolveIdentifier(item['identifier'], item['schema'])
            hits.append(current_item)
        return hits

    def fix_collectedFrom(self, source, relation):
        """Copy ``provisionMode`` from a relation onto the matching datasources of *source*.

        For every ``collectedFrom`` entry of ``relation['source']``, each
        datasource of *source* whose ``datasourceName`` equals the entry's
        provider name receives the entry's ``provisionMode``.

        :param source: object document; modified in place.
        :param relation: a scholix link, or ``None``/empty when there is none.
        :return: *source* -- always, including when there is no relation
            (returning ``None`` there made callers lose the document).
        """
        if not relation:
            return source
        rel_source = relation.get('source') or {}
        for coll in rel_source.get('collectedFrom') or []:
            for d in source.get('datasources') or []:
                if d['datasourceName'] == coll['provider']['name']:
                    d['provisionMode'] = coll['provisionMode']
        return source

    def item_by_id(self, id, type=None, start=None):
        """Fetch one object together with its related publications, datasets and unknown objects.

        :param id: document id in the ``<index>_object`` index.
        :param type: related typology (``publication``, ``dataset`` or
            ``unknown``) to which *start* applies.
        :param start: offset used for the relations of typology *type*.
        :return: a ``DLIESResponse`` whose ``hits`` are ``[document,
            {related_publications, related_dataset, related_unknown}]``; an
            empty ``DLIESResponse`` if anything fails (the error is logged).
        """
        try:
            res = self.client.get(index=self.index_name + "_object", doc_type="_all", id=id, _source=True)
            input_source = self._decorate_source(res['_source'])

            related = dict(publication=[], dataset=[], unknown=[])
            rel_source = None
            for kind, counter in (('publication', 'relatedPublications'),
                                  ('dataset', 'relatedDatasets'),
                                  ('unknown', 'relatedUnknown')):
                # A missing counter means "no relations" (comparing None with
                # an int raises on Python 3).
                if (input_source.get(counter) or 0) > 0:
                    related[kind] = self.related_type(id, kind, start if kind == type else None)
                    # A page may legitimately be empty; only a real link can
                    # provide the provision modes.
                    if related[kind]:
                        rel_source = related[kind][0]

            input_source = self.fix_collectedFrom(input_source, rel_source)
            hits = [
                input_source,
                dict(related_publications=related['publication'], related_dataset=related['dataset'],
                     related_unknown=related['unknown']),
            ]
            return DLIESResponse(total=1, hits=hits)
        except Exception:
            # Deliberate best effort: callers get an empty response, the log
            # gets the message and the full traceback.
            log.exception("Error on getting item ")
            return DLIESResponse()